1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ 17 #define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ 18 19 #include <array> 20 #include <memory> 21 #include <string> 22 #include <unordered_map> 23 #include <vector> 24 25 #include "tensorflow/core/common_runtime/allocator_retry.h" 26 #include "tensorflow/core/common_runtime/visitable_allocator.h" 27 #include "tensorflow/core/lib/gtl/stl_util.h" 28 #include "tensorflow/core/lib/strings/strcat.h" 29 #include "tensorflow/core/platform/macros.h" 30 #include "tensorflow/core/platform/mutex.h" 31 #include "tensorflow/core/platform/thread_annotations.h" 32 #include "tensorflow/core/platform/types.h" 33 #include "tensorflow/core/protobuf/config.pb.h" 34 35 namespace tensorflow { 36 37 // A memory allocator that implements a 'best-fit with coalescing' 38 // algorithm. This is essentially a very simple version of Doug Lea's 39 // malloc (dlmalloc). 40 // 41 // The goal of this allocator is to support defragmentation via 42 // coalescing. One assumption we make is that the process using this 43 // allocator owns pretty much all of the memory, and that nearly 44 // all requests to allocate memory go through this interface. 45 class BFCAllocator : public VisitableAllocator { 46 public: 47 // Takes ownership of sub_allocator. 48 BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, 49 bool allow_growth, const string& name); 50 ~BFCAllocator() override; 51 52 string Name() override { return name_; } 53 void* AllocateRaw(size_t alignment, size_t num_bytes) override; 54 void* AllocateRaw(size_t alignment, size_t num_bytes, 55 const AllocationAttributes& allocation_attr) override; 56 void DeallocateRaw(void* ptr) override; 57 58 void AddAllocVisitor(Visitor visitor) override; 59 60 // Does nothing, because memory is never freed. 61 void AddFreeVisitor(Visitor visitor) override {} 62 63 bool TracksAllocationSizes() override; 64 65 size_t RequestedSize(const void* ptr) override; 66 67 size_t AllocatedSize(const void* ptr) override; 68 69 int64 AllocationId(const void* ptr) override; 70 71 void GetStats(AllocatorStats* stats) override; 72 73 void ClearStats() override; 74 75 private: 76 struct Bin; 77 78 void* AllocateRawInternal(size_t alignment, size_t num_bytes, 79 bool dump_log_on_failure); 80 void DeallocateRawInternal(void* ptr); 81 82 // A ChunkHandle is an index into the chunks_ vector in BFCAllocator 83 // kInvalidChunkHandle means an invalid chunk 84 typedef size_t ChunkHandle; 85 static const int kInvalidChunkHandle = -1; 86 87 typedef int BinNum; 88 static const int kInvalidBinNum = -1; 89 static const int kNumBins = 21; 90 91 // Chunks point to memory. Their prev/next pointers form a 92 // doubly-linked list of addresses sorted by base address that 93 // must be contiguous. Chunks contain information about whether 94 // they are in use or whether they are free, and contain a pointer 95 // to the bin they are in. 96 struct Chunk { 97 size_t size = 0; // Full size of buffer. 98 99 // We sometimes give chunks that are larger than needed to reduce 100 // fragmentation. requested_size keeps track of what the client 101 // actually wanted so we can understand whether our splitting 102 // strategy is efficient. 103 size_t requested_size = 0; 104 105 // allocation_id is set to -1 when the chunk is not in use. It is assigned a 106 // value greater than zero before the chunk is returned from 107 // AllocateRaw, and this value is unique among values assigned by 108 // the parent allocator. 109 int64 allocation_id = -1; 110 void* ptr = nullptr; // pointer to granted subbuffer. 111 112 // If not kInvalidChunkHandle, the memory referred to by 'prev' is directly 113 // preceding the memory used by this chunk. E.g., It should start 114 // at 'ptr - prev->size' 115 ChunkHandle prev = kInvalidChunkHandle; 116 117 // If not kInvalidChunkHandle, the memory referred to by 'next' is directly 118 // following the memory used by this chunk. E.g., It should be at 119 // 'ptr + size' 120 ChunkHandle next = kInvalidChunkHandle; 121 122 // What bin are we in? 123 BinNum bin_num = kInvalidBinNum; 124 125 bool in_use() const { return allocation_id != -1; } 126 127 string DebugString(BFCAllocator* a, 128 bool recurse) NO_THREAD_SAFETY_ANALYSIS { 129 string dbg; 130 strings::StrAppend( 131 &dbg, " Size: ", strings::HumanReadableNumBytes(size), 132 " | Requested Size: ", strings::HumanReadableNumBytes(requested_size), 133 " | in_use: ", in_use()); 134 if (recurse && prev != BFCAllocator::kInvalidChunkHandle) { 135 Chunk* p = a->ChunkFromHandle(prev); 136 strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false)); 137 } 138 if (recurse && next != BFCAllocator::kInvalidChunkHandle) { 139 Chunk* n = a->ChunkFromHandle(next); 140 strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false)); 141 } 142 return dbg; 143 } 144 }; 145 146 // A Bin is a collection of similar-sized free chunks. 147 struct Bin { 148 // All chunks in this bin have >= bin_size memory. 149 size_t bin_size = 0; 150 151 struct ChunkComparator { 152 explicit ChunkComparator(BFCAllocator* allocator) 153 : allocator_(allocator) {} 154 // Sort first by size and then use pointer address as a tie breaker. 155 bool operator()(const ChunkHandle ha, 156 const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS { 157 const Chunk* a = allocator_->ChunkFromHandle(ha); 158 const Chunk* b = allocator_->ChunkFromHandle(hb); 159 if (a->size != b->size) { 160 return a->size < b->size; 161 } 162 return a->ptr < b->ptr; 163 } 164 165 private: 166 BFCAllocator* allocator_; // The parent allocator 167 }; 168 169 typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet; 170 // List of free chunks within the bin, sorted by chunk size. 171 // Chunk * not owned. 172 FreeChunkSet free_chunks; 173 Bin(BFCAllocator* allocator, size_t bs) 174 : bin_size(bs), free_chunks(ChunkComparator(allocator)) {} 175 }; 176 177 static const size_t kMinAllocationBits = 8; 178 static const size_t kMinAllocationSize = 1 << kMinAllocationBits; 179 180 // AllocationRegion maps pointers to ChunkHandles for a single 181 // contiguous memory region. 182 // 183 // This class is thread-compatible. 184 class AllocationRegion { 185 public: 186 AllocationRegion(void* ptr, size_t memory_size) 187 : ptr_(ptr), 188 memory_size_(memory_size), 189 end_ptr_( 190 static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) { 191 DCHECK_EQ(0, memory_size % kMinAllocationSize); 192 const size_t n_handles = 193 (memory_size + kMinAllocationSize - 1) / kMinAllocationSize; 194 handles_ = new ChunkHandle[n_handles]; 195 for (size_t i = 0; i < n_handles; i++) { 196 handles_[i] = kInvalidChunkHandle; 197 } 198 } 199 200 AllocationRegion() {} 201 202 ~AllocationRegion() { delete[] handles_; } 203 204 AllocationRegion(AllocationRegion&& other) { Swap(other); } 205 206 AllocationRegion& operator=(AllocationRegion&& other) { 207 Swap(other); 208 return *this; 209 } 210 211 void* ptr() const { return ptr_; } 212 void* end_ptr() const { return end_ptr_; } 213 size_t memory_size() const { return memory_size_; } 214 ChunkHandle get_handle(const void* p) const { 215 return handles_[IndexFor(p)]; 216 } 217 void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; } 218 void erase(const void* p) { set_handle(p, kInvalidChunkHandle); } 219 220 private: 221 void Swap(AllocationRegion& other) { 222 std::swap(ptr_, other.ptr_); 223 std::swap(memory_size_, other.memory_size_); 224 std::swap(end_ptr_, other.end_ptr_); 225 std::swap(handles_, other.handles_); 226 } 227 228 int IndexFor(const void* p) const { 229 std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p); 230 std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_); 231 DCHECK_GE(p_int, base_int); 232 DCHECK_LT(p_int, base_int + memory_size_); 233 return static_cast<int>(((p_int - base_int) >> kMinAllocationBits)); 234 } 235 236 // Metadata about the allocation region. 237 void* ptr_ = nullptr; 238 size_t memory_size_ = 0; 239 void* end_ptr_ = nullptr; 240 241 // Array of size "memory_size / kMinAllocationSize". It is 242 // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle 243 // for the memory allocation represented by "p" 244 ChunkHandle* handles_ = nullptr; 245 246 TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion); 247 }; 248 249 // RegionManager aggregates one or more "AllocationRegions" and provides 250 // a layer of indirection from pointers to the underlying ChunkHandle, 251 // allowing allocation across multiple discontiguous memory regions. 252 // 253 // This class is thread-compatible. 254 class RegionManager { 255 public: 256 RegionManager() {} 257 ~RegionManager() {} 258 259 void AddAllocationRegion(void* ptr, size_t memory_size) { 260 // Insert sorted by end_ptr 261 auto entry = 262 std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator); 263 regions_.insert(entry, AllocationRegion(ptr, memory_size)); 264 } 265 266 ChunkHandle get_handle(const void* p) const { 267 return RegionFor(p)->get_handle(p); 268 } 269 270 void set_handle(const void* p, ChunkHandle h) { 271 return MutableRegionFor(p)->set_handle(p, h); 272 } 273 void erase(const void* p) { return MutableRegionFor(p)->erase(p); } 274 275 const std::vector<AllocationRegion>& regions() const { return regions_; } 276 277 private: 278 static bool Comparator(const void* ptr, const AllocationRegion& other) { 279 return ptr < other.end_ptr(); 280 } 281 282 AllocationRegion* MutableRegionFor(const void* p) { 283 return const_cast<AllocationRegion*>(RegionFor(p)); 284 } 285 286 const AllocationRegion* RegionFor(const void* p) const { 287 auto entry = 288 std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator); 289 290 if (entry != regions_.end()) { 291 return &(*entry); 292 } 293 294 LOG(FATAL) << "Could not find Region for " << p; 295 return nullptr; 296 } 297 298 private: 299 std::vector<AllocationRegion> regions_; 300 }; 301 302 // Returns 'bytes' rounded up to the next highest kMinAllocationSize. 303 size_t RoundedBytes(size_t bytes); 304 305 // Try to add a new memory region that can satisfy an allocation of 306 // 'rounded_bytes' bytes. Returns true on success and false on 307 // failure. 308 bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); 309 310 // Returns a pointer to an underlying allocated chunk of size 311 // 'rounded_bytes'. 312 void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes) 313 EXCLUSIVE_LOCKS_REQUIRED(lock_); 314 315 // Splits the chunk specified by 'h' into two chunks, one at least 316 // of size 'num_bytes'. 317 void SplitChunk(ChunkHandle h, size_t num_bytes) 318 EXCLUSIVE_LOCKS_REQUIRED(lock_); 319 320 // Merges the two chunk handles. Requires that the chunks are 321 // contiguous in their allocation. 322 void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_); 323 324 // Frees the memory represented by 'h', coalescing the chunk if 325 // possible. 326 void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 327 328 // Adds the chunk 'h' to the proper free bin. 329 void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 330 331 // Removes the free chunk pointed to by 'c' from the set free_chunks. 332 void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks, 333 const Bin::FreeChunkSet::iterator& c) 334 EXCLUSIVE_LOCKS_REQUIRED(lock_); 335 336 // Removes a free chunk from the bin. 337 void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 338 339 // Removes the chunk metadata represented by 'h'. 340 void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 341 342 string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_); 343 void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); 344 345 ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_); 346 void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 347 348 Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_); 349 350 // Information about a Bin that is useful for debugging. 351 struct BinDebugInfo { 352 size_t total_bytes_in_use = 0; 353 size_t total_bytes_in_bin = 0; 354 size_t total_requested_bytes_in_use = 0; 355 size_t total_chunks_in_use = 0; 356 size_t total_chunks_in_bin = 0; 357 }; 358 359 // Computes and returns a BinDebugInfo for each Bin. 360 std::array<BinDebugInfo, kNumBins> get_bin_debug_info() 361 EXCLUSIVE_LOCKS_REQUIRED(lock_); 362 363 AllocatorRetry retry_helper_; 364 365 // Structures immutable after construction 366 size_t memory_limit_ = 0; 367 368 inline int Log2FloorNonZeroSlow(uint64 n) { 369 int r = 0; 370 while (n > 0) { 371 r++; 372 n >>= 1; 373 } 374 return r - 1; 375 } 376 377 // Returns floor(log2(n)). 378 inline int Log2FloorNonZero(uint64 n) { 379 #if defined(__GNUC__) 380 return 63 ^ __builtin_clzll(n); 381 #elif defined(PLATFORM_WINDOWS) 382 unsigned long index; 383 _BitScanReverse64(&index, n); 384 return index; 385 #else 386 return Log2FloorNonZeroSlow(n); 387 #endif 388 } 389 390 // Map from bin size to Bin 391 Bin* BinFromIndex(BinNum index) { 392 return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)])); 393 } 394 size_t BinNumToSize(BinNum index) { 395 return static_cast<size_t>(256) << index; 396 } 397 BinNum BinNumForSize(size_t bytes) { 398 uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits; 399 int b = std::min(kNumBins - 1, Log2FloorNonZero(v)); 400 return b; 401 } 402 Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); } 403 404 char bins_space_[sizeof(Bin) * kNumBins]; 405 406 // The size of the current region allocation. 407 size_t curr_region_allocation_bytes_; 408 409 // The total number of allocated bytes by the allocator. 410 size_t total_region_allocated_bytes_ = 0; 411 412 // An indicator that expansion of a region has hit the limits 413 // of the available memory. 414 bool started_backpedal_ = false; 415 416 std::unique_ptr<SubAllocator> suballocator_; 417 string name_; 418 419 // Structures mutable after construction 420 mutable mutex lock_; 421 RegionManager region_manager_ GUARDED_BY(lock_); 422 423 std::vector<Chunk> chunks_ GUARDED_BY(lock_); 424 425 // Pointer to head of linked list of free Chunks 426 ChunkHandle free_chunks_list_ GUARDED_BY(lock_); 427 428 // Called once on each region, ASAP. 429 std::vector<Visitor> region_visitors_ GUARDED_BY(lock_); 430 431 // Counter containing the next unique identifier to assign to a 432 // newly-created chunk. 433 int64 next_allocation_id_ GUARDED_BY(lock_); 434 435 // Stats. 436 AllocatorStats stats_ GUARDED_BY(lock_); 437 438 friend class GPUBFCAllocatorPrivateMethodsTest; 439 TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator); 440 }; 441 442 } // namespace tensorflow 443 444 #endif // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_ 445