/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Suite of types that represent device memory allocations. These are
// allocated by the StreamExecutor interface, which produces values appropriate
// for the underlying platform (whether it be CUDA or OpenCL).
//
// The untyped base class (like a device void*) is DeviceMemoryBase, which can
// be specialized for a given allocation type (like a device T*) using
// DeviceMemory<T>.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_
#define TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_

#include <stddef.h>

#include <cstddef>
#include <functional>
#include <initializer_list>

#include "tensorflow/stream_executor/lib/casts.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace perftools {
namespace gputools {

class StreamExecutor;

// void*-analogous device memory allocation. For the typed variation, see
// DeviceMemory<T>.
//
// This is effectively a two-tuple of a pointer and size; however, note that the
// pointer may not be to the virtual address itself -- in OpenCL the pointer is
// to a cl_mem handle that describes the device allocation. Therefore,
// DeviceMemoryBase::opaque does not necessarily produce a pointer that can be
// referenced directly, so use it with caution.
//
// Thread-compatible.
47 class DeviceMemoryBase { 48 public: 49 // Default constructor instantiates a null-pointed, zero-sized device memory 50 // region. An opaque pointer may be provided -- see header for details on the 51 // opacity of that pointer. 52 explicit DeviceMemoryBase(void *opaque = nullptr, uint64 size = 0, 53 bool is_sub_buffer = false) 54 : opaque_(opaque), size_(size), is_sub_buffer_(is_sub_buffer) {} 55 56 // Returns whether the backing memory is the null pointer. 57 // A `== nullptr` convenience method is also provided. 58 bool is_null() const { return opaque_ == nullptr; } 59 bool operator==(std::nullptr_t other) const { return is_null(); } 60 bool operator!=(std::nullptr_t other) const { return !is_null(); } 61 62 // Provides a partial order between device memory values. 63 // 64 // This operator is provided so that this object can be used as a key in an 65 // ordered map. 66 bool operator<(const DeviceMemoryBase &other) const { 67 return opaque() < other.opaque(); 68 } 69 70 // Returns the size, in bytes, for the backing memory. 71 uint64 size() const { return size_; } 72 73 // Warning: note that the pointer returned is not necessarily directly to 74 // device virtual address space, but is platform-dependent. 75 void *opaque() { return opaque_; } 76 const void *opaque() const { return opaque_; } 77 78 // Returns true if this is an offset into another primary allocation. 79 bool is_sub_buffer() const { return is_sub_buffer_; } 80 81 // Returns whether the two DeviceMemoryBase segments are identical (both in 82 // their opaque pointer and size). 83 bool IsSameAs(const DeviceMemoryBase &other) const { 84 return opaque() == other.opaque() && size() == other.size(); 85 } 86 87 protected: 88 friend class StreamExecutor; 89 90 // Resets the internal values of the opaque pointer and number of bytes in the 91 // memory region, just as in the constructor. 
92 void Reset(void *opaque, uint64 bytes) { 93 opaque_ = opaque; 94 size_ = bytes; 95 } 96 97 private: 98 void *opaque_; // Platform-dependent value representing allocated memory. 99 uint64 size_; // Size in bytes of this allocation. 100 bool is_sub_buffer_; // Is this a primary allocation or a sub-buffer? 101 }; 102 103 // Typed wrapper around "void *"-like DeviceMemoryBase. 104 // 105 // For example, DeviceMemory<int> is a simple wrapper around DeviceMemoryBase 106 // that represents one or more integers in Device memory. 107 // 108 // Thread-compatible. 109 template <typename ElemT> 110 class DeviceMemory final : public DeviceMemoryBase { 111 public: 112 // Default constructor instantiates a null-pointed, zero-sized memory region. 113 DeviceMemory() : DeviceMemoryBase(nullptr, 0) {} 114 DeviceMemory(std::nullptr_t) : DeviceMemory() {} 115 116 // Typed device memory regions may be constructed from untyped device memory 117 // regions, this effectively amounts to a cast from a void*. 118 explicit DeviceMemory(const DeviceMemoryBase &other) 119 : DeviceMemoryBase(const_cast<DeviceMemoryBase &>(other).opaque(), 120 other.size(), other.is_sub_buffer()) {} 121 122 // Returns the number of elements of type ElemT that constitute this 123 // allocation. 124 uint64 ElementCount() const { return size() / sizeof(ElemT); } 125 126 // Returns whether this is a single-element allocation. 127 bool IsScalar() const { return ElementCount() == 1; } 128 129 // Create a typed area of DeviceMemory with a given opaque pointer and the 130 // quantity of bytes in the allocation. This function is broken out to 131 // distinguish bytes from an element count. 132 static DeviceMemory<ElemT> MakeFromByteSize(void *opaque, uint64 bytes) { 133 return DeviceMemory<ElemT>(opaque, bytes); 134 } 135 136 // Resets the DeviceMemory data, in MakeFromByteSize fashion. 137 // This simply clobbers the prior values. 
138 void ResetFromByteSize(void *opaque, uint64 bytes) { 139 // TODO(leary) when NVCC is eliminated we can add this check (and the 140 // logging include it requires). 141 // CHECK_EQ(0, bytes % sizeof(ElemT)); 142 DeviceMemoryBase::Reset(opaque, bytes); 143 } 144 145 // ------------------------------------------------------------ 146 147 protected: 148 // This constructor is solely used from derived classes; it is made protected 149 // because it accepts a byte-size instead of an element count, which could 150 // potentially be misused given the ElementCount() nature of this interface. 151 // 152 // In order to specify the desire to use byte size instead of element count 153 // explicitly, use MakeFromByteSize. 154 DeviceMemory(void *opaque, uint64 size) : DeviceMemoryBase(opaque, size) {} 155 }; 156 157 // A class to encapsulate the type and size of a dynamic shared memory 158 // buffer. Because the buffer exists solely on the device and is not copyable 159 // to the host, memory objects of this type do not maintain buffer pointers 160 // on the host. 161 template <typename ElemT> 162 class SharedDeviceMemory final : public DeviceMemoryBase { 163 public: 164 explicit SharedDeviceMemory(uint64 elem_count) 165 : DeviceMemoryBase(nullptr, elem_count * kElemSize) {} 166 167 static constexpr size_t kElemSize = sizeof(ElemT); 168 169 // Returns the number of elements of type ElemT that constitute this 170 // allocation. 171 uint64 ElementCount() const { return size() / kElemSize; } 172 173 // Returns whether this is a single-element allocation. 174 bool IsScalar() const { return ElementCount() == 1; } 175 }; 176 177 // Similar to the typed DeviceMemory, but is the unique owner of its 178 // memory, if any. ScopedDeviceMemory is thread-compatible. It is also 179 // movable and uncopyable to represent unique ownership. 180 template <typename ElemT> 181 class ScopedDeviceMemory { 182 public: 183 // Default construction initializes the internal state to nullptr. 
This 184 // mirrors the std::unique_ptr<> functionality, where default construction 185 // produces a nullptr unique_ptr, which can be assigned later. 186 ScopedDeviceMemory(); 187 188 // Parameters: 189 // parent: Executor used to deallocate memory when this instance goes 190 // out of scope. 191 // value: Already-allocated device memory value for this scoped mechanism to 192 // deallocate. This memory must have been allocated by parent. 193 ScopedDeviceMemory(StreamExecutor *parent, DeviceMemoryBase value); 194 195 // Constructor overload that places a literal array into device memory 196 ScopedDeviceMemory(StreamExecutor *parent, 197 std::initializer_list<ElemT> values); 198 199 // Moves ownership of the memory from other to the constructed 200 // object. 201 // 202 // Postcondition: other == nullptr. 203 ScopedDeviceMemory(ScopedDeviceMemory &&other) noexcept: 204 ScopedDeviceMemory(other.parent_, other.Release()) {} 205 206 // Releases the memory that was provided in the constructor, through the 207 // "parent" StreamExecutor. 208 ~ScopedDeviceMemory(); 209 210 // Moves ownership of the memory from other to this object. 211 // 212 // Postcondition: other == nullptr. 213 ScopedDeviceMemory& operator=(ScopedDeviceMemory &&other) { 214 Reset(other.Release()); 215 parent_ = other.parent_; 216 return *this; 217 } 218 219 // Returns the memory that backs this scoped allocation converted to 220 // DeviceMemory<T> apparent type. This is useful for cases where the 221 // DeviceMemory must be passed by const-ref, as the ScopedDeviceMemory doesn't 222 // allow copying, for scoped-object-lifetime reasons. 223 const DeviceMemory<ElemT> &cref() const { return wrapped_; } 224 225 // Returns a pointer to the DeviceMemory<T> apparent type for use in mutable 226 // operations. The value returned should not be used outside the scope of this 227 // ScopedDeviceMemory object's lifetime. 
228 DeviceMemory<ElemT> *ptr() { return &wrapped_; } 229 const DeviceMemory<ElemT> *ptr() const { return &wrapped_; } 230 231 // Smart-pointer-like operators for the wrapped DeviceMemory. 232 // This reference must not be used outside the lifetime of this 233 // ScopedDeviceMemory. 234 const DeviceMemory<ElemT> &operator*() const { return cref(); } 235 DeviceMemory<ElemT> *operator->() { return ptr(); } 236 const DeviceMemory<ElemT> *operator->() const { return ptr(); } 237 bool operator==(std::nullptr_t other) const { return wrapped_.is_null(); } 238 bool operator!=(std::nullptr_t other) const { return !wrapped_.is_null(); } 239 240 // Analogous to std::unique_ptr::reset, frees the existing memory held in 241 // this scoped memory container and replaces it with updated. Ownership 242 // of updated is transferred to this object. 243 void Reset(DeviceMemory<ElemT> updated); 244 void Reset(std::nullptr_t); 245 246 // Analogous to std::unique_ptr::release, releases ownership of the held 247 // memory and transfers it to the caller. 248 // 249 // Postcondition: *this == nullptr 250 DeviceMemory<ElemT> Release() { 251 auto tmp = wrapped_; 252 wrapped_.ResetFromByteSize(nullptr, 0); 253 return tmp; 254 } 255 256 private: 257 DeviceMemory<ElemT> wrapped_; // Value we wrap with scoped-release. 258 StreamExecutor *parent_; // See constructor. 259 260 SE_DISALLOW_COPY_AND_ASSIGN(ScopedDeviceMemory); 261 }; 262 263 // Host-side representation of packed-and-aligned vector datatypes on the device 264 // side. Since these can appear in device kernel signatures, we support 265 // launching them with these datatypes in launch signatures. 
266 267 struct Float2 { 268 float x, y; 269 }; 270 271 struct Float4 { 272 Float2 xz, yw; 273 }; 274 275 struct Double2 { 276 double x, y; 277 }; 278 279 static_assert(sizeof(Float2) == 2 * sizeof(float), "Float2 must be packed"); 280 static_assert(sizeof(Float4) == 4 * sizeof(float), "Float4 must be packed"); 281 static_assert(sizeof(Double2) == 2 * sizeof(double), "Double2 must be packed"); 282 283 } // namespace gputools 284 } // namespace perftools 285 286 #endif // TENSORFLOW_STREAM_EXECUTOR_DEVICE_MEMORY_H_ 287