/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_

// Simple LRU pool allocators for various flavors of CPU RAM that
// implement the VisitableAllocator interface. GPU memory is managed
// by GPURegionAllocator.

#include <atomic>
#include <map>
#include <memory>
#include <vector>
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

// Interface of an object that rounds up integers.
class RoundUpInterface {
 public:
  virtual ~RoundUpInterface() {}
  virtual size_t RoundUp(size_t num_bytes) = 0;
};
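
// Example: a hypothetical custom rounder showing how RoundUpInterface
// is meant to be implemented; here, sizes are rounded up to the next
// multiple of 256 bytes (illustrative sketch only):
//
//   class Align256Rounder : public RoundUpInterface {
//    public:
//     size_t RoundUp(size_t num_bytes) override {
//       // No-op for exact multiples of 256; otherwise round up.
//       return (num_bytes + 255) & ~static_cast<size_t>(255);
//     }
//   };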

// Size-limited pool of memory buffers obtained from a SubAllocator
// instance.  Pool eviction policy is LRU.
class PoolAllocator : public VisitableAllocator {
 public:
  // "pool_size_limit" is the maximum number of returned, re-usable
  // memory buffers to keep in the pool.  If pool_size_limit == 0, the
  // pool is effectively a thin wrapper around the allocator.
  // If "auto_resize" is true, then the pool_size_limit will gradually
  // be raised so that deallocations happen very rarely, if at all.
  // Transitory start-up objects may deallocate, but the long-term
  // working-set should not. Auto-resizing can raise pool_size_limit
  // but will never lower it.
  // "allocator" is the object that performs the underlying memory
  // malloc/free operations.  This object takes ownership of both
  // "allocator" and "size_rounder".
  PoolAllocator(size_t pool_size_limit, bool auto_resize,
                SubAllocator* allocator, RoundUpInterface* size_rounder,
                string name);
  ~PoolAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override;

  void DeallocateRaw(void* ptr) override;

  // REQUIRES: The following functions may only be called prior
  // to the first Allocate*() call.  Once allocation has begun, it is
  // illegal to register another visitor.

  void AddAllocVisitor(Visitor visitor) override;

  void AddFreeVisitor(Visitor visitor) override;

  // Allocate an unused memory region of size "num_bytes".  Fetch from
  // the pool if available, otherwise call allocator_.
  void* Get(size_t num_bytes);

  // Return a no-longer needed memory region to the pool.  It is an error
  // to dereference "ptr" after this call.  If the pool is full, the least
  // recently used region will be deallocated.
  void Put(void* ptr, size_t num_bytes);

  // Reset the pool to empty.
  void Clear();

  // The following accessors permit monitoring the effectiveness of
  // the pool at avoiding repeated malloc/frees on the underlying
  // allocator.  Read locks are not taken on the theory that value
  // consistency with other threads is not important.

  // Number of Get() requests satisfied from pool.
  int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS {
    return get_from_pool_count_;
  }
  // Number of Put() requests.
  int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; }
  // Number of Get() requests requiring a fresh allocation.
  int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS {
    return allocated_count_;
  }
  // Number of pool evictions.
  int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS {
    return evicted_count_;
  }
  // Current size limit.
  size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS {
    return pool_size_limit_;
  }

  void GetStats(AllocatorStats* stats) override { stats->Clear(); }

 private:
  struct PtrRecord {
    void* ptr;
    size_t num_bytes;
    PtrRecord* prev;
    PtrRecord* next;
  };

  // Remove "pr" from the doubly-linked LRU list.
  void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Add "pr" to the head of the doubly-linked LRU list.
  void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Delete the least recently used record.
  void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  const string name_;
  const bool has_size_limit_;
  const bool auto_resize_;
  size_t pool_size_limit_;
  std::unique_ptr<SubAllocator> allocator_;
  std::unique_ptr<RoundUpInterface> size_rounder_;
  mutex mutex_;
  std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
  PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
  PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
  int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0;
  int64 put_count_ GUARDED_BY(mutex_) = 0;
  int64 allocated_count_ GUARDED_BY(mutex_) = 0;
  int64 evicted_count_ GUARDED_BY(mutex_) = 0;
  // Write access to these is guarded by mutex_, but not read
  // access. They may only be modified prior to the first
  // allocation.  Later attempts to modify will fail.
  std::vector<Visitor> alloc_visitors_;
  std::vector<Visitor> free_visitors_;
  std::atomic<bool> allocation_begun_;
};
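
// Example: an approximate pool hit rate derived from the counters above
// (illustrative sketch; "pool" is a hypothetical PoolAllocator*). Since
// the accessors take no locks, the values are only approximate while
// other threads are allocating:
//
//   int64 hits = pool->get_from_pool_count();
//   int64 total_gets = hits + pool->allocated_count();  // all Get() calls
//   double hit_rate =
//       total_gets > 0 ? static_cast<double>(hits) / total_gets : 0.0;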

// Do-nothing rounder. Passes through sizes unchanged.
class NoopRounder : public RoundUpInterface {
 public:
  size_t RoundUp(size_t num_bytes) override { return num_bytes; }
};

// Power of 2 rounder: rounds up to nearest power of 2 size.
class Pow2Rounder : public RoundUpInterface {
 public:
  size_t RoundUp(size_t num_bytes) override {
    return 1uLL << Log2Ceiling64(num_bytes);
  }
};
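
// Example: behavior of the two rounders above (illustrative sketch):
//
//   Pow2Rounder p2;
//   p2.RoundUp(1000);    // == 1024, the next power of 2
//   p2.RoundUp(1024);    // == 1024, already a power of 2
//
//   NoopRounder noop;
//   noop.RoundUp(1000);  // == 1000, unchanged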

// Allocator for ordinary (unpinned) CPU RAM, backed by aligned
// malloc/free from the port layer.
class BasicCPUAllocator : public SubAllocator {
 public:
  ~BasicCPUAllocator() override {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    return port::AlignedMalloc(num_bytes, alignment);
  }
  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
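
// Example: assembling a size-limited CPU pool from the pieces above
// (illustrative sketch; the limit of 100 buffers is arbitrary). The
// PoolAllocator takes ownership of the SubAllocator and the rounder:
//
//   PoolAllocator pool(/*pool_size_limit=*/100, /*auto_resize=*/false,
//                      new BasicCPUAllocator(), new Pow2Rounder(),
//                      "cpu_pool");
//   void* buf = pool.AllocateRaw(/*alignment=*/64, /*num_bytes=*/1000);
//   // ... use buf ...
//   pool.DeallocateRaw(buf);  // Returned to the pool, not freed.
//   // A later request whose size rounds to the same bucket (1000 and
//   // 1024 both round to 1024 under Pow2Rounder) can be satisfied
//   // from the pool without a fresh allocation.
//   void* buf2 = pool.AllocateRaw(64, 1024);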

// Allocator for pinned CPU RAM that is made known to CUDA for the
// purpose of efficient DMA with a GPU.
class CUDAHostAllocator : public SubAllocator {
 public:
  // Note: stream_exec cannot be null.
  explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
      : stream_exec_(stream_exec) {
    CHECK(stream_exec_ != nullptr);
  }
  ~CUDAHostAllocator() override {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = nullptr;
    if (num_bytes > 0) {
      ptr = stream_exec_->HostMemoryAllocate(num_bytes);
      if (ptr == nullptr) {
        LOG(WARNING) << "could not allocate pinned host memory of size: "
                     << num_bytes;
      }
    }
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    if (ptr != nullptr) {
      stream_exec_->HostMemoryDeallocate(ptr);
    }
  }

 private:
  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null

  TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
};
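
// Example: pooling pinned host memory for GPU DMA (illustrative sketch;
// "stream_exec" is assumed to be a valid StreamExecutor obtained
// elsewhere, and CUDAHostAllocator does not take ownership of it).
// Pinning and unpinning host pages is expensive, which is what makes
// pooling these buffers worthwhile:
//
//   perftools::gputools::StreamExecutor* stream_exec = ...;
//   PoolAllocator pinned_pool(/*pool_size_limit=*/100, /*auto_resize=*/true,
//                             new CUDAHostAllocator(stream_exec),
//                             new Pow2Rounder(), "pinned_pool");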

}  // namespace tensorflow
#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_