// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource (at) google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>              // for pthread_t, pthread_key_t
#endif
#include <stddef.h>               // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>               // for uint32_t, uint64_t
#endif
#include <sys/types.h>            // for ssize_t
#include "common.h"               // for SizeMap, kMaxSize, etc
#include "free_list.h"            // for FL_Pop, FL_PopRange, etc
#include "internal_logging.h"     // for ASSERT, etc
#include "maybe_threads.h"
#include "page_heap_allocator.h"  // for PageHeapAllocator
#include "sampler.h"              // for Sampler
#include "static_vars.h"          // for Static

namespace tcmalloc {

// Even if we have support for thread-local storage in the compiler
// and linker, the OS may not support it.  We need to check that at
// runtime.  Right now, we have to keep a manual set of "bad" OSes.
#if defined(HAVE_TLS)
extern bool kernel_supports_tls;   // defined in thread_cache.cc
void CheckIfKernelSupportsTLS();
inline bool KernelSupportsTLS() {
  return kernel_supports_tls;
}
#endif  // HAVE_TLS

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class.  The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t size_class);
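
  // Illustrative only -- a sketch of how a caller is expected to pair
  // the two calls above.  The SizeClass() lookup is an assumption based
  // on SizeMap in common.h; only ByteSizeForClass() is used elsewhere
  // in this header:
  //
  //   size_t cl = Static::sizemap()->SizeClass(size);
  //   size_t alloc_size = Static::sizemap()->ByteSizeForClass(cl);
  //   void* ptr = ThreadCache::GetCache()->Allocate(alloc_size, cl);
  //   ...
  //   ThreadCache* heap = ThreadCache::GetCacheIfPresent();
  //   if (heap != NULL) heap->Deallocate(ptr, cl);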

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff allocation
  // should be sampled.
  bool SampleAllocation(size_t k);

  // Record additional bytes allocated.
  void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; }

  // Return the total number of bytes allocated from this heap.  The value
  // will wrap when there is an overflow, and so only the differences between
  // two values should be relied on (and even then, modulo 2^32).
  uint32_t GetTotalBytesAllocated() const;

  // On the current thread, return GetTotalBytesAllocated().
  static uint32_t GetBytesAllocatedOnCurrentThread();

  static void InitModule();
  static void InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void BecomeIdle();

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Writes to total_bytes the total number of bytes used by all thread heaps.
  // class_count must be an array of size kNumClasses; for each size class,
  // the number of items on the corresponding freelist is written to it.
  // class_count may be NULL.  The storage of both parameters must be
  // zero initialized.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void* list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      FL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return FL_Pop(&list_);
    }

    void* Next() {
      if (list_ == NULL) return NULL;
      return FL_Next(list_);
    }

    void PushRange(int N, void *start, void *end) {
      FL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      FL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };
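
  // Illustrative only -- expected FreeList behavior, assuming
  // FL_PushRange()/FL_PopRange() link and unlink at the head (the link
  // order is defined in free_list.h; a, b, c are hypothetical objects
  // pre-linked as a->b->c):
  //
  //   FreeList fl;
  //   fl.Init();
  //   fl.PushRange(3, a, c);         // list is a->b->c, length() == 3
  //   void *start, *end;
  //   fl.PopRange(2, &start, &end);  // hands back a->b, length() == 1
  //   fl.clear_lowwatermark();       // lowwatermark() == 1
  //   fl.Pop();                      // length() == 0, lowwatermark() == 0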

  // Fetches an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above but requires Static::pageheap_lock is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE_TLS
  static __thread ThreadCache* threadlocal_heap_
  // This code links against pyautolib.so, which causes dlopen() on that
  // shared object to fail when -fprofile-generate is used with it.  Ideally
  // pyautolib.so should not link against this code.  There is a bug filed
  // for that:
  //   http://code.google.com/p/chromium/issues/detail?id=124489
  // For now the workaround is to pass in -DPGO_GENERATE when building Chrome
  // for instrumentation (-fprofile-generate).
  // For all non-instrumentation builds, this define will not be set and the
  // performance benefit of "initial-exec" will be achieved.
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
  __attribute__ ((tls_model ("initial-exec")))
#endif
  ;
#endif
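
  // Illustrative only -- the assumed registration pattern (the real code
  // lives in thread_cache.cc).  Both stores are needed: the __thread copy
  // for fast reads, and the TSD slot so that DestroyThreadCache() runs at
  // thread exit:
  //
  //   threadlocal_heap_ = heap;                        // fast lookup
  //   perftools_pthread_setspecific(heap_key_, heap);  // cleanup hook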

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited_ is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  Represents
  // the next ThreadCache from which a thread over its max_size_ should
  // steal some of its memory limit.  We round-robin through all of the
  // objects in thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t size_;                  // Combined size of data
  size_t max_size_;              // size_ > max_size_ --> Scavenge()

  // The following is a tally of the bytes allocated on this thread in
  // response to any flavor of malloc() call.  The aggregated amount includes
  // all padding to the smallest size class that can hold the request, or to
  // the nearest whole page when a large allocation is made without using a
  // class.  This sum is currently used for Chromium profiling, where tallies
  // are kept of the amount of memory allocated during the running of each
  // task on each thread.
  uint32_t total_bytes_allocated_;  // Total, modulo 2^32.

  // We sample allocations, biased by the size of the allocation
  Sampler sampler_;              // A sampler

  FreeList list_[kNumClasses];   // Array indexed by size-class

  pthread_t tid_;                // Which thread owns it
  bool in_setspecific_;          // In call to pthread_setspecific?

  // Allocate a new heap.  REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned.  This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps.
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline uint32_t ThreadCache::GetTotalBytesAllocated() const {
  return total_bytes_allocated_;
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (list->empty()) {
    return FetchFromCentralCache(cl, size);
  }
  size_ -= size;
  return list->Pop();
}

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class.  A more comprehensive (and expensive) test would be to walk
  // the entire freelist.  But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // There are two relatively uncommon things that require further work.
  // In the common case we're done, and the bitwise-or below lets us
  // detect both conditions with a single branch.
  if ((list_headroom | size_headroom) < 0) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}
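
// Why the bitwise-or above needs only one branch: for two's-complement
// ssize_t values, (a | b) < 0 exactly when the sign bit of a or b is set,
// i.e. when at least one headroom went negative.  With hypothetical values
// list_headroom == 3 and size_headroom == -2, (3 | -2) == -1 < 0, so the
// slow path runs and ListTooLong()/Scavenge() are considered; with both
// headrooms non-negative, the or is non-negative and we fall straight
// through.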

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  // __thread is faster, but only when the kernel supports it
  if (KernelSupportsTLS())
    return threadlocal_heap_;
#endif
  return reinterpret_cast<ThreadCache *>(
      perftools_pthread_getspecific(heap_key_));
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_