/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_

// Simple LRU pool allocators for various flavors of CPU RAM that
// implement the VisitableAllocator interface. GPU memory is managed
// by GPURegionAllocator.

#include <atomic>
#include <map>
#include <memory>
#include <vector>
#include "tensorflow/core/common_runtime/visitable_allocator.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

// Interface of an object that rounds up integers.
class RoundUpInterface {
 public:
  virtual ~RoundUpInterface() {}
  virtual size_t RoundUp(size_t num_bytes) = 0;
};
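// Example (a minimal sketch, not part of this header's API): a custom
// rounder that quantizes request sizes up to multiples of 256 bytes,
// illustrating how RoundUpInterface may be extended to reduce the number
// of distinct bucket sizes in a pool:
//
//   class Multiple256Rounder : public RoundUpInterface {
//    public:
//     size_t RoundUp(size_t num_bytes) override {
//       // Round up to the next multiple of 256 (256 is a power of 2,
//       // so masking the low bits suffices).
//       return (num_bytes + 255) & ~static_cast<size_t>(255);
//     }
//   };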
// Size-limited pool of memory buffers obtained from a SubAllocator
// instance. Pool eviction policy is LRU.
class PoolAllocator : public VisitableAllocator {
 public:
  // "pool_size_limit" is the maximum number of returned, reusable
  // memory buffers to keep in the pool. If pool_size_limit == 0, the
  // pool is effectively a thin wrapper around the allocator.
  // If "auto_resize" is true, then pool_size_limit will gradually
  // be raised so that deallocations happen very rarely, if at all.
  // Transitory start-up objects may deallocate, but the long-term
  // working set should not. Auto-resizing can raise pool_size_limit
  // but will never lower it.
  // "allocator" is the object that performs the underlying memory
  // malloc/free operations. This object takes ownership of allocator.
  PoolAllocator(size_t pool_size_limit, bool auto_resize,
                SubAllocator* allocator, RoundUpInterface* size_rounder,
                string name);
  ~PoolAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override;

  void DeallocateRaw(void* ptr) override;

  // REQUIRES: The following functions may only be called prior
  // to the first Allocate*() call. Once allocation has begun, it is
  // illegal to register another visitor.

  void AddAllocVisitor(Visitor visitor) override;

  void AddFreeVisitor(Visitor visitor) override;

  // Allocate an unused memory region of size "num_bytes". Fetch from
  // the pool if available, otherwise call allocator_.
  void* Get(size_t num_bytes);

  // Return a no-longer-needed memory region to the pool. It is an error
  // to dereference "ptr" after this call. If the pool is full, the least
  // recently used region will be deallocated.
  void Put(void* ptr, size_t num_bytes);

  // Reset the pool to empty.
  void Clear();

  // The following accessors permit monitoring the effectiveness of
  // the pool at avoiding repeated malloc/free calls on the underlying
  // allocator. Read locks are not taken on the theory that value
  // consistency with other threads is not important.

  // Number of Get() requests satisfied from pool.
  int64 get_from_pool_count() const NO_THREAD_SAFETY_ANALYSIS {
    return get_from_pool_count_;
  }
  // Number of Put() requests.
  int64 put_count() const NO_THREAD_SAFETY_ANALYSIS { return put_count_; }
  // Number of Get() requests requiring a fresh allocation.
  int64 allocated_count() const NO_THREAD_SAFETY_ANALYSIS {
    return allocated_count_;
  }
  // Number of pool evictions.
  int64 evicted_count() const NO_THREAD_SAFETY_ANALYSIS {
    return evicted_count_;
  }
  // Current size limit.
  size_t size_limit() const NO_THREAD_SAFETY_ANALYSIS {
    return pool_size_limit_;
  }

  void GetStats(AllocatorStats* stats) override { stats->Clear(); }

 private:
  struct PtrRecord {
    void* ptr;
    size_t num_bytes;
    PtrRecord* prev;
    PtrRecord* next;
  };

  // Remove "pr" from the doubly-linked LRU list.
  void RemoveFromList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Add "pr" to the head of the doubly-linked LRU list.
  void AddToList(PtrRecord* pr) EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  // Delete the least recently used record.
  void EvictOne() EXCLUSIVE_LOCKS_REQUIRED(mutex_);

  const string name_;
  const bool has_size_limit_;
  const bool auto_resize_;
  size_t pool_size_limit_;
  std::unique_ptr<SubAllocator> allocator_;
  std::unique_ptr<RoundUpInterface> size_rounder_;
  mutex mutex_;
  std::multimap<const size_t, PtrRecord*> pool_ GUARDED_BY(mutex_);
  PtrRecord* lru_head_ GUARDED_BY(mutex_) = nullptr;
  PtrRecord* lru_tail_ GUARDED_BY(mutex_) = nullptr;
  int64 get_from_pool_count_ GUARDED_BY(mutex_) = 0;
  int64 put_count_ GUARDED_BY(mutex_) = 0;
  int64 allocated_count_ GUARDED_BY(mutex_) = 0;
  int64 evicted_count_ GUARDED_BY(mutex_) = 0;
  // Write access to these is guarded by mutex_, but not read
  // access. They may only be modified prior to the first
  // allocation. Later attempts to modify will fail.
  std::vector<Visitor> alloc_visitors_;
  std::vector<Visitor> free_visitors_;
  std::atomic<bool> allocation_begun_;
};

// Do-nothing rounder. Passes through sizes unchanged.
class NoopRounder : public RoundUpInterface {
 public:
  size_t RoundUp(size_t num_bytes) override { return num_bytes; }
};

// Power-of-2 rounder: rounds up to the nearest power-of-2 size.
class Pow2Rounder : public RoundUpInterface {
 public:
  size_t RoundUp(size_t num_bytes) override {
    return 1uLL << Log2Ceiling64(num_bytes);
  }
};

class BasicCPUAllocator : public SubAllocator {
 public:
  ~BasicCPUAllocator() override {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    return port::AlignedMalloc(num_bytes, alignment);
  }
  void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); }
};
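// Example (a minimal sketch; the variable name is hypothetical): a
// size-limited, power-of-2-bucketed pool over ordinary CPU RAM might be
// assembled as
//
//   PoolAllocator* pool = new PoolAllocator(
//       100 /*pool_size_limit*/, false /*auto_resize*/,
//       new BasicCPUAllocator(), new Pow2Rounder(), "cpu_pool");
//
// The PoolAllocator takes ownership of both the SubAllocator and the
// RoundUpInterface passed to it (it holds them in std::unique_ptr).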
// Allocator for pinned CPU RAM that is made known to CUDA for the
// purpose of efficient DMA with a GPU.
class CUDAHostAllocator : public SubAllocator {
 public:
  // Note: stream_exec cannot be null.
  explicit CUDAHostAllocator(perftools::gputools::StreamExecutor* stream_exec)
      : stream_exec_(stream_exec) {
    CHECK(stream_exec_ != nullptr);
  }
  ~CUDAHostAllocator() override {}

  void* Alloc(size_t alignment, size_t num_bytes) override {
    void* ptr = nullptr;
    if (num_bytes > 0) {
      ptr = stream_exec_->HostMemoryAllocate(num_bytes);
      if (ptr == nullptr) {
        LOG(WARNING) << "could not allocate pinned host memory of size: "
                     << num_bytes;
      }
    }
    return ptr;
  }

  void Free(void* ptr, size_t num_bytes) override {
    if (ptr != nullptr) {
      stream_exec_->HostMemoryDeallocate(ptr);
    }
  }

 private:
  perftools::gputools::StreamExecutor* stream_exec_;  // not owned, non-null

  TF_DISALLOW_COPY_AND_ASSIGN(CUDAHostAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_COMMON_RUNTIME_GPU_POOL_ALLOCATOR_H_