/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_

#include <algorithm>
#include <array>
#include <cstdint>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/protobuf/config.pb.h"

namespace tensorflow {

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm.  This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing.  One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
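//
// A minimal usage sketch (illustrative only; 'MySubAllocator' is a
// hypothetical SubAllocator implementation, not part of this file):
//
//   BFCAllocator allocator(new MySubAllocator,   // ownership transferred
//                          1ULL << 30,           // 1GiB memory limit
//                          /*allow_growth=*/true, "example_bfc");
//   void* p = allocator.AllocateRaw(/*alignment=*/256, /*num_bytes=*/4096);
//   ...
//   allocator.DeallocateRaw(p);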
class BFCAllocator : public VisitableAllocator {
 public:
  // Takes ownership of sub_allocator.
  BFCAllocator(SubAllocator* sub_allocator, size_t total_memory,
               bool allow_growth, const string& name);
  ~BFCAllocator() override;

  string Name() override { return name_; }
  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;
  void DeallocateRaw(void* ptr) override;

  void AddAllocVisitor(Visitor visitor) override;

  // Does nothing, because memory is never freed.
  void AddFreeVisitor(Visitor visitor) override {}

  bool TracksAllocationSizes() override;

  size_t RequestedSize(const void* ptr) override;

  size_t AllocatedSize(const void* ptr) override;

  int64 AllocationId(const void* ptr) override;

  void GetStats(AllocatorStats* stats) override;

  void ClearStats() override;

 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure);
  void DeallocateRawInternal(void* ptr);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator.
  // kInvalidChunkHandle means an invalid chunk.
  typedef size_t ChunkHandle;
  static const ChunkHandle kInvalidChunkHandle = SIZE_MAX;

  typedef int BinNum;
  static const int kInvalidBinNum = -1;
  static const int kNumBins = 21;
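
  // Bin i holds free chunks of size in [256 << i, 256 << (i + 1)); with
  // kNumBins == 21, the final bin collects every free chunk of 256MiB or
  // larger (see BinNumToSize() and BinNumForSize() below).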

  // Chunks point to memory.  Their prev/next pointers form a
  // doubly-linked list over the chunks of a contiguous memory region,
  // sorted by base address.  Chunks record whether they are in use or
  // free, and which bin they are in.
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation.  requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use. It is
    // assigned a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64 allocation_id = -1;
    void* ptr = nullptr;  // pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' directly
    // precedes the memory used by this chunk, i.e. it should start
    // at 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' directly
    // follows the memory used by this chunk, i.e. it should start at
    // 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    bool in_use() const { return allocation_id != -1; }

    string DebugString(BFCAllocator* a,
                       bool recurse) NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, "  Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ", strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use());
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
      return dbg;
    }
  };
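
  // An illustrative example (not a literal memory dump): after SplitChunk()
  // carves a 256-byte allocation out of a free 1KiB chunk A, the region
  // holds two contiguous chunks linked through prev/next:
  //
  //   [ A: size=256, in use ][ B: size=768, free ]
  //   A.next == handle of B, B.prev == handle of A
  //
  // When A is later freed, FreeAndMaybeCoalesce() can merge A and B back
  // into a single 1KiB free chunk.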

  // A Bin is a collection of similar-sized free chunks.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    struct ChunkComparator {
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha,
                      const ChunkHandle hb) const NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator.
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // Set of free chunks within the bin, sorted by chunk size.  The
    // Chunk objects referred to by the handles are owned by chunks_,
    // not by this set.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };

  static const size_t kMinAllocationBits = 8;
  static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
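
  // I.e. every chunk and region size is a multiple of 256 bytes, so an
  // AllocationRegion needs one ChunkHandle slot per 256 bytes of memory.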

  // AllocationRegion maps pointers to ChunkHandles for a single
  // contiguous memory region.
  //
  // This class is thread-compatible.
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_ = new ChunkHandle[n_handles];
      for (size_t i = 0; i < n_handles; i++) {
        handles_[i] = kInvalidChunkHandle;
      }
    }

    AllocationRegion() {}

    ~AllocationRegion() { delete[] handles_; }

    AllocationRegion(AllocationRegion&& other) { Swap(other); }

    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(other);
      return *this;
    }

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion& other) {
      std::swap(ptr_, other.ptr_);
      std::swap(memory_size_, other.memory_size_);
      std::swap(end_ptr_, other.end_ptr_);
      std::swap(handles_, other.handles_);
    }

    int IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<int>(((p_int - base_int) >> kMinAllocationBits));
    }
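
    // E.g. with kMinAllocationSize == 256, a pointer 1000 bytes past the
    // base of the region maps to index 3 (1000 >> 8 == 3), the slot shared
    // by all addresses 768..1023 bytes past the base.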

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize".  It is indexed
    // by (p - base) / kMinAllocationSize and contains the ChunkHandle
    // for the memory allocation represented by "p".
    ChunkHandle* handles_ = nullptr;

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }
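
    // regions_ is kept sorted by end_ptr and regions never overlap, so the
    // upper_bound in RegionFor() returns the first region whose end_ptr
    // lies beyond 'p', which is exactly the region containing 'p' (assuming
    // 'p' was allocated from one of the regions).  E.g. given regions
    // [0x1000, 0x2000) and [0x8000, 0x9000), looking up p == 0x8010 skips
    // the first region (end_ptr 0x2000 <= p) and selects the second.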

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the nearest multiple of
  // kMinAllocationSize.
  size_t RoundedBytes(size_t bytes);
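
  // E.g., assuming the usual round-up-to-a-multiple computation:
  // RoundedBytes(1) == 256, RoundedBytes(256) == 256, and
  // RoundedBytes(257) == 512.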

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes.  Returns true on success and false on
  // failure.
  bool Extend(size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size at least
  // 'rounded_bytes', taken from bin 'bin_num' or a larger bin, or nullptr
  // if no suitable free chunk is available.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles.  Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Frees the memory represented by 'h', coalescing the chunk if
  // possible.
  void FreeAndMaybeCoalesce(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  string RenderOccupancy() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction.
  size_t memory_limit_ = 0;

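  // Portable fallback for Log2FloorNonZero() below: computes floor(log2(n))
  // for n > 0 by counting how many right-shifts it takes to clear 'n'.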
  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)).
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }
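
  // E.g. Log2FloorNonZero(1) == 0, Log2FloorNonZero(255) == 7, and
  // Log2FloorNonZero(256) == 8.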

  // Helpers that map between bin indices, bin sizes, and Bins.
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }
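
  // E.g. BinNumForSize(256) == 0, BinNumForSize(511) == 0,
  // BinNumForSize(512) == 1, and any size of 256MiB or more maps to the
  // last bin, kNumBins - 1.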

  char bins_space_[sizeof(Bin) * kNumBins];

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of bytes allocated by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  std::unique_ptr<SubAllocator> suballocator_;
  string name_;

  // Structures mutable after construction.
  mutable mutex lock_;
  RegionManager region_manager_ GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ GUARDED_BY(lock_);

  // Handle of the head of a linked list of Chunk records that are free
  // for reuse.
  ChunkHandle free_chunks_list_ GUARDED_BY(lock_);

  // Visitors are called once on each region, as soon as possible: on every
  // existing region when the visitor is added, and on each new region when
  // it is created.
  std::vector<Visitor> region_visitors_ GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64 next_allocation_id_ GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ GUARDED_BY(lock_);

  friend class GPUBFCAllocatorPrivateMethodsTest;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_BFC_ALLOCATOR_H_