1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_FRAMEWORK_ALLOCATOR_H_ 17 #define TENSORFLOW_FRAMEWORK_ALLOCATOR_H_ 18 19 #include <stdlib.h> 20 21 #include <limits> 22 23 #include "tensorflow/core/framework/numeric_types.h" 24 #include "tensorflow/core/framework/resource_handle.h" 25 #include "tensorflow/core/framework/type_traits.h" 26 #include "tensorflow/core/framework/variant.h" 27 #include "tensorflow/core/platform/logging.h" 28 #include "tensorflow/core/platform/types.h" 29 30 namespace tensorflow { 31 32 // Attributes for a single allocation call. Different calls to the same 33 // allocator could potentially have different allocation attributes. 34 struct AllocationAttributes { 35 // If the first attempt to allocate the memory fails, the allocation 36 // should return immediately without retrying. 37 // An example use case is optional scratch spaces where a failure 38 // has only performance impact. 39 bool no_retry_on_failure = false; 40 // If a Tensor is allocated without the following set to true, then 41 // it is logged as an unknown allocation. During execution Tensors 42 // should be allocated through the OpKernelContext which records 43 // which Op is performing the allocation, and sets this flag to 44 // true. 45 bool allocation_will_be_logged = false; 46 }; 47 48 // Runtime statistics collected by an allocator. 49 struct AllocatorStats { 50 int64 num_allocs; // Number of allocations. 51 int64 bytes_in_use; // Number of bytes in use. 52 int64 max_bytes_in_use; // The maximum bytes in use. 53 int64 max_alloc_size; // The max single allocation seen. 54 55 // The upper limit what the allocator can allocate, if such a limit 56 // is known. Certain allocator may return 0 to indicate the limit is 57 // unknown. 58 int64 bytes_limit; 59 60 AllocatorStats() { Clear(); } 61 62 void Clear(); 63 string DebugString() const; 64 }; 65 66 // Allocator is an abstract interface for allocating and deallocating 67 // device memory. 68 class Allocator { 69 public: 70 #ifdef EIGEN_VECTORIZE_AVX512 71 // Align to 64 byte boundary. 72 static constexpr size_t kAllocatorAlignment = 64; 73 #else 74 // Align to 32 byte boundary. 75 static constexpr size_t kAllocatorAlignment = 32; 76 #endif 77 78 virtual ~Allocator(); 79 80 // Return a string identifying this allocator 81 virtual string Name() = 0; 82 83 // Return an uninitialized block of memory that is "num_bytes" bytes 84 // in size. The returned pointer is guaranteed to be aligned to a 85 // multiple of "alignment" bytes. 86 // REQUIRES: "alignment" is a power of 2. 87 virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0; 88 89 // Return an uninitialized block of memory that is "num_bytes" bytes 90 // in size with specified allocation attributes. The returned pointer is 91 // guaranteed to be aligned to a multiple of "alignment" bytes. 92 // REQUIRES: "alignment" is a power of 2. 93 virtual void* AllocateRaw(size_t alignment, size_t num_bytes, 94 const AllocationAttributes& allocation_attr) { 95 // The default behavior is to use the implementation without any allocation 96 // attributes. 97 return AllocateRaw(alignment, num_bytes); 98 } 99 100 // Deallocate a block of memory pointer to by "ptr" 101 // REQUIRES: "ptr" was previously returned by a call to AllocateRaw 102 virtual void DeallocateRaw(void* ptr) = 0; 103 104 // Convenience functions to do typed allocation. C++ constructors 105 // and destructors are invoked for complex types if necessary, 106 // depending on the concrete Allocator implementation. May return 107 // NULL if the tensor has too many elements to represent in a single 108 // allocation. 109 template <typename T> 110 T* Allocate(size_t num_elements) { 111 return Allocate<T>(num_elements, AllocationAttributes()); 112 } 113 114 template <typename T> 115 T* Allocate(size_t num_elements, 116 const AllocationAttributes& allocation_attr) { 117 // TODO(jeff): Do we need to allow clients to pass in alignment 118 // requirements? 119 120 if (num_elements > (std::numeric_limits<size_t>::max() / sizeof(T))) { 121 return NULL; 122 } 123 124 void* p = AllocateRaw(kAllocatorAlignment, sizeof(T) * num_elements, 125 allocation_attr); 126 T* typed_p = reinterpret_cast<T*>(p); 127 if (typed_p) RunCtor<T>(typed_p, num_elements); 128 return typed_p; 129 } 130 131 template <typename T> 132 void Deallocate(T* ptr, size_t num_elements) { 133 if (ptr) { 134 RunDtor<T>(ptr, num_elements); 135 DeallocateRaw(ptr); 136 } 137 } 138 139 // Returns true if this allocator tracks the sizes of allocations. 140 // RequestedSize and AllocatedSize must be overridden if 141 // TracksAllocationSizes is overridden to return true. 142 virtual bool TracksAllocationSizes() { return false; } 143 144 // Returns true if this allocator requires tensors with 0 elements 145 // to allocate buffers. This is false for most allocators, but may 146 // be used by special-case allocators that want to track tensor 147 // usage. 148 virtual bool ShouldAllocateEmptyTensors() { return false; } 149 150 // Returns the user-requested size of the data allocated at 151 // 'ptr'. Note that the actual buffer allocated might be larger 152 // than requested, but this function returns the size requested by 153 // the user. 154 // 155 // REQUIRES: TracksAllocationSizes() is true. 156 // 157 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 158 // allocated by this allocator. 159 virtual size_t RequestedSize(const void* ptr) { 160 CHECK(false) << "allocator doesn't track sizes"; 161 return size_t(0); 162 } 163 164 // Returns the allocated size of the buffer at 'ptr' if known, 165 // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is 166 // guaranteed to be >= RequestedSize(ptr). 167 // 168 // REQUIRES: TracksAllocationSizes() is true. 169 // 170 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 171 // allocated by this allocator. 172 virtual size_t AllocatedSize(const void* ptr) { return RequestedSize(ptr); } 173 174 // Returns either 0 or an identifier assigned to the buffer at 'ptr' 175 // when the buffer was returned by AllocateRaw. If non-zero, the 176 // identifier differs from every other ID assigned by this 177 // allocator. 178 // 179 // REQUIRES: TracksAllocationSizes() is true. 180 // 181 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 182 // allocated by this allocator. 183 virtual int64 AllocationId(const void* ptr) { return 0; } 184 185 // Returns the allocated size of the buffer at 'ptr' if known, 186 // otherwise returns 0. This method can be called when 187 // TracksAllocationSizes() is false, but can be extremely slow. 188 // 189 // REQUIRES: 'ptr!=nullptr' and points to a buffer previously 190 // allocated by this allocator. 191 virtual size_t AllocatedSizeSlow(const void* ptr) { 192 if (TracksAllocationSizes()) { 193 return AllocatedSize(ptr); 194 } 195 return 0; 196 } 197 198 // Fills in 'stats' with statistics collected by this allocator. 199 virtual void GetStats(AllocatorStats* stats) { stats->Clear(); } 200 201 // Clears the internal stats except for the `in_use` field. 202 virtual void ClearStats() {} 203 204 private: 205 // No constructors or destructors are run for simple types 206 template <typename T> 207 void RunCtor(T* p, size_t n) { 208 static_assert(is_simple_type<T>::value, "T is not a simple type."); 209 } 210 211 template <typename T> 212 void RunDtor(T* p, size_t n) {} 213 214 // custom constructors and destructors that can be overridden for 215 // non-standard allocators 216 217 // Runs string's default constructor for p[0], p[1], ..., p[n-1]. 218 virtual void RunStringCtor(string* p, size_t n) { 219 for (size_t i = 0; i < n; ++p, ++i) new (p) string(); 220 } 221 222 // Runs string's default destructor for p[0], p[1], ..., p[n-1]. 223 virtual void RunStringDtor(string* p, size_t n) { 224 for (size_t i = 0; i < n; ++p, ++i) p->~string(); 225 } 226 227 virtual void RunResourceCtor(ResourceHandle* p, size_t n) { 228 for (size_t i = 0; i < n; ++p, ++i) new (p) ResourceHandle(); 229 } 230 231 // Runs string's default destructor for p[0], p[1], ..., p[n-1]. 232 virtual void RunResourceDtor(ResourceHandle* p, size_t n) { 233 for (size_t i = 0; i < n; ++p, ++i) p->~ResourceHandle(); 234 } 235 236 virtual void RunVariantCtor(Variant* p, size_t n) { 237 for (size_t i = 0; i < n; ++p, ++i) new (p) Variant(); 238 } 239 240 virtual void RunVariantDtor(Variant* p, size_t n) { 241 for (size_t i = 0; i < n; ++p, ++i) p->~Variant(); 242 } 243 244 // TODO(jeff): Maybe provide some interface to give info about 245 // current allocation state (total number of bytes available for 246 // allocation, number of bytes free on device, etc.) 247 }; 248 249 // Allocator-specific constructors and destructors are used for 250 // strings 251 template <> 252 inline void Allocator::RunCtor(string* p, size_t n) { 253 RunStringCtor(p, n); 254 } 255 256 template <> 257 inline void Allocator::RunDtor(string* p, size_t n) { 258 RunStringDtor(p, n); 259 } 260 261 template <> 262 inline void Allocator::RunCtor(ResourceHandle* p, size_t n) { 263 RunResourceCtor(p, n); 264 } 265 266 template <> 267 inline void Allocator::RunDtor(ResourceHandle* p, size_t n) { 268 RunResourceDtor(p, n); 269 } 270 271 template <> 272 inline void Allocator::RunCtor(Variant* p, size_t n) { 273 RunVariantCtor(p, n); 274 } 275 276 template <> 277 inline void Allocator::RunDtor(Variant* p, size_t n) { 278 RunVariantDtor(p, n); 279 } 280 281 // An implementation of Allocator that delegates all calls to another Allocator. 282 // 283 // Useful to clients who want to override part of the functionality of another 284 // allocator. 285 class AllocatorWrapper : public Allocator { 286 public: 287 explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {} 288 289 ~AllocatorWrapper() override {} 290 291 // Returns the wrapped allocator to which all calls are delegated. 292 Allocator* wrapped() const { return wrapped_; } 293 294 string Name() override { return wrapped_->Name(); } 295 296 void* AllocateRaw(size_t alignment, size_t num_bytes) override { 297 return wrapped_->AllocateRaw(alignment, num_bytes); 298 } 299 300 void* AllocateRaw(size_t alignment, size_t num_bytes, 301 const AllocationAttributes& allocation_attr) override { 302 return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr); 303 } 304 305 void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); } 306 307 bool TracksAllocationSizes() override { 308 return wrapped_->TracksAllocationSizes(); 309 } 310 311 bool ShouldAllocateEmptyTensors() override { 312 return wrapped_->TracksAllocationSizes(); 313 } 314 315 size_t RequestedSize(const void* ptr) override { 316 return wrapped_->RequestedSize(ptr); 317 } 318 319 size_t AllocatedSize(const void* ptr) override { 320 return wrapped_->AllocatedSize(ptr); 321 } 322 323 int64 AllocationId(const void* ptr) override { 324 return wrapped_->AllocationId(ptr); 325 } 326 327 size_t AllocatedSizeSlow(const void* ptr) override { 328 return wrapped_->AllocatedSizeSlow(ptr); 329 } 330 331 private: 332 Allocator* const wrapped_; 333 }; 334 335 // A tensorflow Op may need access to different kinds of memory that 336 // are not simply a function of the device to which the Op has been 337 // assigned. For example, an Op executing on a GPU may still need 338 // to allocate CPU RAM for some purpose. Internal to the tensorflow 339 // runtime we may choose to allocate CPU ram from special regions 340 // that have been prepared for higher performance in some use 341 // contexts, e.g. doing DMA with particular devices. For these 342 // reasons, the Device interface does not expose just one memory 343 // Allocator, but instead provides an accessor that takes a 344 // specification of the desired memory attributes in order to select 345 // an Allocator. 346 // 347 // Example use: 348 // // Allocator for ordinary device memory: 349 // Allocator* a = allocator(AllocatorAttributes()); 350 // ... 351 // // Allocator for CPU RAM, regardless of where Op is executing: 352 // AllocatorAttributes attr; 353 // attr.set_on_host(true); 354 // Allocator* a = allocator(attr); 355 struct AllocatorAttributes { 356 void set_on_host(bool v) { value |= (static_cast<int>(v)); } 357 bool on_host() const { return value & 0x1; } 358 void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); } 359 bool nic_compatible() const { return value & (0x1 << 1); } 360 void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); } 361 bool gpu_compatible() const { return value & (0x1 << 2); } 362 void Merge(AllocatorAttributes other) { value |= other.value; } 363 // Returns true if the fields set in *this is a subset of or equal to 364 // those set in other. 365 bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const { 366 return (value | other.value) == other.value; 367 } 368 369 // NOTE: The upper 8 bits of the value are reserved for 370 // device-specific uses. Implementors of a device can interpret these 371 // upper 8 bits in device-specific ways, and ops implemented for those 372 // devices are responsible for setting those 8 bits appropriately. 373 uint32 value = 0; 374 }; 375 376 // Returns a trivial implementation of Allocator which uses the system 377 // default malloc. The returned allocator is a process singleton. 378 Allocator* cpu_allocator(); 379 380 // If 'enable' is true, the process-wide cpu allocator collects 381 // AllocatorStats. By default, it's disabled. 382 void EnableCPUAllocatorStats(bool enable); 383 384 // If 'enable' is true, the process-wide cpu allocator collects full 385 // statistics. By default, it's disabled. 386 void EnableCPUAllocatorFullStats(bool enable); 387 388 // Abstract interface of an object that does the underlying suballoc/free of 389 // memory for a higher-level allocator. 390 class SubAllocator { 391 public: 392 virtual ~SubAllocator() {} 393 virtual void* Alloc(size_t alignment, size_t num_bytes) = 0; 394 virtual void Free(void* ptr, size_t num_bytes) = 0; 395 }; 396 397 } // namespace tensorflow 398 399 #endif // TENSORFLOW_FRAMEWORK_ALLOCATOR_H_ 400