// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Sanjay Ghemawat <opensource@google.com>

#ifndef TCMALLOC_THREAD_CACHE_H_
#define TCMALLOC_THREAD_CACHE_H_

#include <config.h>
#ifdef HAVE_PTHREAD
#include <pthread.h>                    // for pthread_t, pthread_key_t
#endif
#include <stddef.h>                     // for size_t, NULL
#ifdef HAVE_STDINT_H
#include <stdint.h>                     // for uint32_t, uint64_t
#endif
#include <sys/types.h>                  // for ssize_t
#include "common.h"                     // for SizeMap, kMaxSize, etc
#include "free_list.h"                  // for FL_Pop, FL_PopRange, etc
#include "internal_logging.h"           // for ASSERT, etc
#include "maybe_threads.h"
#include "page_heap_allocator.h"        // for PageHeapAllocator
#include "sampler.h"                    // for Sampler
#include "static_vars.h"                // for Static

namespace tcmalloc {

// Even if we have support for thread-local storage in the compiler
// and linker, the OS may not support it.  We need to check that at
// runtime.  Right now, we have to keep a manual set of "bad" OSes.
#if defined(HAVE_TLS)
extern bool kernel_supports_tls;   // defined in thread_cache.cc
void CheckIfKernelSupportsTLS();
inline bool KernelSupportsTLS() {
  return kernel_supports_tls;
}
#endif    // HAVE_TLS

//-------------------------------------------------------------------
// Data kept per thread
//-------------------------------------------------------------------

class ThreadCache {
 public:
  // All ThreadCache objects are kept in a linked list (for stats collection)
  ThreadCache* next_;
  ThreadCache* prev_;

  void Init(pthread_t tid);
  void Cleanup();

  // Accessors (mostly just for printing stats)
  int freelist_length(size_t cl) const { return list_[cl].length(); }

  // Total byte size in cache
  size_t Size() const { return size_; }

  // Allocate an object of the given size and class. The size given
  // must be the same as the size of the class in the size map.
  void* Allocate(size_t size, size_t cl);
  void Deallocate(void* ptr, size_t size_class);

  void Scavenge();

  int GetSamplePeriod();

  // Record allocation of "k" bytes.  Return true iff the allocation
  // should be sampled.
  bool SampleAllocation(size_t k);
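
  // Illustrative use of SampleAllocation() (a sketch, not code from this
  // file; RecordAllocationSample is a hypothetical profiler hook):
  //   if (heap->SampleAllocation(size)) {
  //     RecordAllocationSample(ptr, size);  // e.g. capture a stack trace
  //   }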

  // Record additional bytes allocated.
  void AddToByteAllocatedTotal(size_t k) { total_bytes_allocated_ += k; }

  // Return the total number of bytes allocated from this heap.  The value will
  // wrap when there is an overflow, and so only the differences between two
  // values should be relied on (and even then, modulo 2^32).
  uint32_t GetTotalBytesAllocated() const;
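
  // Illustrative note: since unsigned subtraction is modulo 2^32, a delta
  // computed as below stays valid across a single counter wrap:
  //   uint32_t before = heap->GetTotalBytesAllocated();
  //   // ... run some work that allocates ...
  //   uint32_t delta = heap->GetTotalBytesAllocated() - before;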

  // On the current thread, return GetTotalBytesAllocated().
  static uint32_t GetBytesAllocatedOnCurrentThread();

  static void         InitModule();
  static void         InitTSD();
  static ThreadCache* GetThreadHeap();
  static ThreadCache* GetCache();
  static ThreadCache* GetCacheIfPresent();
  static ThreadCache* CreateCacheIfNecessary();
  static void         BecomeIdle();

  // Return the number of thread heaps in use.
  static inline int HeapsInUse();

  // Writes to total_bytes the total number of bytes used by all thread heaps.
  // class_count, if non-NULL, must be an array of size kNumClasses; the
  // number of items on each corresponding freelist is accumulated into it.
  // The storage behind both parameters must be zero-initialized.
  // REQUIRES: Static::pageheap_lock is held.
  static void GetThreadStats(uint64_t* total_bytes, uint64_t* class_count);

  // Sets the total thread cache size to new_size, recomputing the
  // individual thread cache sizes as necessary.
  // REQUIRES: Static::pageheap_lock is held.
  static void set_overall_thread_cache_size(size_t new_size);
  static size_t overall_thread_cache_size() {
    return overall_thread_cache_size_;
  }

 private:
  class FreeList {
   private:
    void*    list_;       // Linked list of nodes

#ifdef _LP64
    // On 64-bit hardware, manipulating 16-bit values may be slightly slow.
    uint32_t length_;      // Current length.
    uint32_t lowater_;     // Low water mark for list length.
    uint32_t max_length_;  // Dynamic max list length based on usage.
    // Tracks the number of times a deallocation has caused
    // length_ > max_length_.  After the kMaxOverages'th time, max_length_
    // shrinks and length_overages_ is reset to zero.
    uint32_t length_overages_;
#else
    // If we aren't using 64-bit pointers then pack these into less space.
    uint16_t length_;
    uint16_t lowater_;
    uint16_t max_length_;
    uint16_t length_overages_;
#endif

   public:
    void Init() {
      list_ = NULL;
      length_ = 0;
      lowater_ = 0;
      max_length_ = 1;
      length_overages_ = 0;
    }

    // Return current length of list
    size_t length() const {
      return length_;
    }

    // Return the maximum length of the list.
    size_t max_length() const {
      return max_length_;
    }

    // Set the maximum length of the list.  If 'new_max' < length(), the
    // client is responsible for removing objects from the list.
    void set_max_length(size_t new_max) {
      max_length_ = new_max;
    }

    // Return the number of times that length() has gone over max_length().
    size_t length_overages() const {
      return length_overages_;
    }

    void set_length_overages(size_t new_count) {
      length_overages_ = new_count;
    }

    // Is list empty?
    bool empty() const {
      return list_ == NULL;
    }

    // Low-water mark management
    int lowwatermark() const { return lowater_; }
    void clear_lowwatermark() { lowater_ = length_; }

    void Push(void* ptr) {
      FL_Push(&list_, ptr);
      length_++;
    }

    void* Pop() {
      ASSERT(list_ != NULL);
      length_--;
      if (length_ < lowater_) lowater_ = length_;
      return FL_Pop(&list_);
    }

    void* Next() {
      if (list_ == NULL) return NULL;
      return FL_Next(list_);
    }

    void PushRange(int N, void *start, void *end) {
      FL_PushRange(&list_, start, end);
      length_ += N;
    }

    void PopRange(int N, void **start, void **end) {
      FL_PopRange(&list_, N, start, end);
      ASSERT(length_ >= N);
      length_ -= N;
      if (length_ < lowater_) lowater_ = length_;
    }
  };
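
  // Illustrative walk-through of the low-water mark (a sketch of the
  // intended semantics, not library code):
  //   list.Push(a); list.Push(b);  // length_ == 2
  //   list.clear_lowwatermark();   // lowater_ == 2
  //   list.Pop();                  // length_ == 1, lowater_ drops to 1
  //   list.Push(c);                // length_ == 2, lowater_ stays 1
  // lowater_ never reached 0, so one object sat unused for the whole
  // interval; that is the kind of signal Scavenge() can use to decide
  // how many objects are safe to release.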

  // Gets and returns an object from the central cache, and, if possible,
  // also adds some objects of that size class to this thread cache.
  void* FetchFromCentralCache(size_t cl, size_t byte_size);

  // Releases some number of items from src.  Adjusts the list's max_length
  // to eventually converge on num_objects_to_move(cl).
  void ListTooLong(FreeList* src, size_t cl);

  // Releases N items from this thread cache.
  void ReleaseToCentralCache(FreeList* src, size_t cl, int N);

  // Increase max_size_ by reducing unclaimed_cache_space_ or by
  // reducing the max_size_ of some other thread.  In both cases,
  // the delta is kStealAmount.
  void IncreaseCacheLimit();
  // Same as above, but requires that Static::pageheap_lock is held.
  void IncreaseCacheLimitLocked();

  // If TLS is available, we also store a copy of the per-thread object
  // in a __thread variable since __thread variables are faster to read
  // than pthread_getspecific().  We still need pthread_setspecific()
  // because __thread variables provide no way to run cleanup code when
  // a thread is destroyed.
  // We also give a hint to the compiler to use the "initial exec" TLS
  // model.  This is faster than the default TLS model, at the cost that
  // you cannot dlopen this library.  (To see the difference, look at
  // the CPU use of __tls_get_addr with and without this attribute.)
  // Since we don't really use dlopen in google code -- and using dlopen
  // on a malloc replacement is asking for trouble in any case -- that's
  // a good tradeoff for us.
#ifdef HAVE_TLS
  static __thread ThreadCache* threadlocal_heap_
  // This code links against pyautolib.so, which causes dlopen() on that shared
  // object to fail when -fprofile-generate is used with it.  Ideally
  // pyautolib.so should not link against this code.  There is a bug filed for
  // that:
  // http://code.google.com/p/chromium/issues/detail?id=124489
  // For now the workaround is to pass in -DPGO_GENERATE when building Chrome
  // for instrumentation (-fprofile-generate).
  // For all non-instrumentation builds, this define will not be set and the
  // performance benefit of "initial-exec" will be achieved.
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
  __attribute__ ((tls_model ("initial-exec")))
#endif
  ;
#endif

  // Thread-specific key.  Initialization here is somewhat tricky
  // because some Linux startup code invokes malloc() before it
  // is in a good enough state to handle pthread_key_create().
  // Therefore, we use TSD keys only after tsd_inited is set to true.
  // Until then, we use a slow path to get the heap object.
  static bool tsd_inited_;
  static pthread_key_t heap_key_;

  // Linked list of heap objects.  Protected by Static::pageheap_lock.
  static ThreadCache* thread_heaps_;
  static int thread_heap_count_;

  // A pointer to one of the objects in thread_heaps_.  Identifies the
  // next ThreadCache from which a thread over its max_size_ should
  // steal memory limit.  Round-robins through all of the objects in
  // thread_heaps_.  Protected by Static::pageheap_lock.
  static ThreadCache* next_memory_steal_;

  // Overall thread cache size.  Protected by Static::pageheap_lock.
  static size_t overall_thread_cache_size_;

  // Global per-thread cache size.  Writes are protected by
  // Static::pageheap_lock.  Reads are done without any locking, which should
  // be fine as long as size_t can be written atomically and we don't place
  // invariants between this variable and other pieces of state.
  static volatile size_t per_thread_cache_size_;

  // Represents overall_thread_cache_size_ minus the sum of max_size_
  // across all ThreadCaches.  Protected by Static::pageheap_lock.
  static ssize_t unclaimed_cache_space_;

  // This class is laid out with the most frequently used fields
  // first so that hot elements are placed on the same cache line.

  size_t        size_;                  // Combined size of data
  size_t        max_size_;              // size_ > max_size_ --> Scavenge()

  // The following is the tally of bytes allocated on a thread as a response to
  // any flavor of malloc() call.  The aggregated amount includes all padding to
  // the smallest class that can hold the request, or to the nearest whole page
  // when a large allocation is made without using a class.  This sum is
  // currently used for Chromium profiling, where tallies are kept of the amount
  // of memory allocated during the running of each task on each thread.
  uint32_t      total_bytes_allocated_;  // Total, modulo 2^32.

  // We sample allocations, biased by the size of the allocation
  Sampler       sampler_;               // A sampler

  FreeList      list_[kNumClasses];     // Array indexed by size-class

  pthread_t     tid_;                   // Which thread owns it
  bool          in_setspecific_;        // In call to pthread_setspecific?

  // Allocate a new heap. REQUIRES: Static::pageheap_lock is held.
  static ThreadCache* NewHeap(pthread_t tid);

  // Use only as pthread thread-specific destructor function.
  static void DestroyThreadCache(void* ptr);

  static void DeleteCache(ThreadCache* heap);
  static void RecomputePerThreadCacheSize();

  // Ensure that this class is cacheline-aligned. This is critical for
  // performance, as false sharing would negate many of the benefits
  // of a per-thread cache.
} CACHELINE_ALIGNED;

// Allocator for thread heaps
// This is logically part of the ThreadCache class, but MSVC, at
// least, does not like using ThreadCache as a template argument
// before the class is fully defined.  So we put it outside the class.
extern PageHeapAllocator<ThreadCache> threadcache_allocator;

inline int ThreadCache::HeapsInUse() {
  return threadcache_allocator.inuse();
}

inline bool ThreadCache::SampleAllocation(size_t k) {
  return sampler_.SampleAllocation(k);
}

inline uint32_t ThreadCache::GetTotalBytesAllocated() const {
  return total_bytes_allocated_;
}

inline void* ThreadCache::Allocate(size_t size, size_t cl) {
  ASSERT(size <= kMaxSize);
  ASSERT(size == Static::sizemap()->ByteSizeForClass(cl));

  FreeList* list = &list_[cl];
  if (list->empty()) {
    return FetchFromCentralCache(cl, size);
  }
  size_ -= size;
  return list->Pop();
}
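
// Minimal caller sketch (illustrative; assumes the size class `cl` for the
// request has already been looked up via Static::sizemap()):
//   ThreadCache* heap = ThreadCache::GetCache();
//   size_t byte_size = Static::sizemap()->ByteSizeForClass(cl);
//   void* ptr = heap->Allocate(byte_size, cl);
//   // ... use ptr ...
//   heap->Deallocate(ptr, cl);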

inline void ThreadCache::Deallocate(void* ptr, size_t cl) {
  FreeList* list = &list_[cl];
  size_ += Static::sizemap()->ByteSizeForClass(cl);
  ssize_t size_headroom = max_size_ - size_ - 1;

  // This catches back-to-back frees of allocs in the same size
  // class. A more comprehensive (and expensive) test would be to walk
  // the entire freelist. But this might be enough to find some bugs.
  ASSERT(ptr != list->Next());

  list->Push(ptr);
  ssize_t list_headroom =
      static_cast<ssize_t>(list->max_length()) - list->length();

  // Two relatively uncommon conditions require further work: the list
  // growing past max_length(), and size_ reaching max_size_.  In the
  // common case neither holds, and the bitwise-or trick below detects
  // that with a single branch: the OR of two signed values is negative
  // iff at least one of them is negative.
  if ((list_headroom | size_headroom) < 0) {
    if (list_headroom < 0) {
      ListTooLong(list, cl);
    }
    if (size_ >= max_size_) Scavenge();
  }
}
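
// Illustrative check of the branch trick above (not library code): the OR
// of two ssize_t values has its sign bit set iff at least one input does.
//   ssize_t a = -1, b = 5;  // (a | b) == -1  -> negative, slow path taken
//   ssize_t c = 3,  d = 5;  // (c | d) == 7   -> non-negative, fast path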

inline ThreadCache* ThreadCache::GetThreadHeap() {
#ifdef HAVE_TLS
  // __thread is faster, but only when the kernel supports it
  if (KernelSupportsTLS())
    return threadlocal_heap_;
#endif
  return reinterpret_cast<ThreadCache*>(
      perftools_pthread_getspecific(heap_key_));
}

inline ThreadCache* ThreadCache::GetCache() {
  ThreadCache* ptr = NULL;
  if (!tsd_inited_) {
    InitModule();
  } else {
    ptr = GetThreadHeap();
  }
  if (ptr == NULL) ptr = CreateCacheIfNecessary();
  return ptr;
}

// In deletion paths, we do not try to create a thread-cache.  This is
// because we may be in the thread destruction code and may have
// already cleaned up the cache for this thread.
inline ThreadCache* ThreadCache::GetCacheIfPresent() {
  if (!tsd_inited_) return NULL;
  return GetThreadHeap();
}
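
// Illustrative deletion-path pattern (a sketch; the fallback branch is
// hypothetical and depends on the caller):
//   ThreadCache* heap = ThreadCache::GetCacheIfPresent();
//   if (heap != NULL) {
//     heap->Deallocate(ptr, cl);
//   } else {
//     // no per-thread cache: return the object to the central free list
//   }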

}  // namespace tcmalloc

#endif  // TCMALLOC_THREAD_CACHE_H_