// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource@google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

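// The overall thread cache budget can also be set at startup through the
// TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES environment variable (read once
// below via EnvToInt64).  For example, running a binary with
// TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=67108864 in its environment caps
// the combined thread caches at roughly 64MB; values above 1GB are
// clamped in set_overall_thread_cache_size() at the bottom of this file.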
DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total number of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "Maximum value of this flag is capped to 1 GB.");

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
// See comments in thread_cache.h about this. Bug here:
// http://code.google.com/p/chromium/issues/detail?id=124489
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
   __attribute__ ((tls_model ("initial-exec")))
#endif
   ;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = true;
    }
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
    void CheckIfKernelSupportsTLS() {
      kernel_supports_tls = false;
    }
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
    void CheckIfKernelSupportsTLS() {
      struct utsname buf;
      if (uname(&buf) < 0) {   // should be impossible
        Log(kLog, __FILE__, __LINE__,
            "uname failed; assuming no TLS support (errno)", errno);
        kernel_supports_tls = false;
      } else if (strcasecmp(buf.sysname, "linux") == 0) {
        // The linux case: the first kernel to support TLS was 2.6.0
        if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
          kernel_supports_tls = false;
        else if (buf.release[0] == '2' && buf.release[1] == '.' &&
                 buf.release[2] >= '0' && buf.release[2] < '6' &&
                 buf.release[3] == '.')                       // 2.0 - 2.5
          kernel_supports_tls = false;
        else
          kernel_supports_tls = true;
      } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
        // In my testing, this version of cygwin, at least, would hang
        // when using TLS.
        kernel_supports_tls = false;
      } else {        // some other kernel, we'll be optimistic
        kernel_supports_tls = true;
      }
      // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
    }
#  endif  // HAVE_DECL_UNAME
#endif    // HAVE_TLS

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;
  total_bytes_allocated_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_  = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

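  // Seed the per-thread sampler from the thread id so that different
  // threads get different sampling sequences.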
  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
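  // One fetched object is handed straight back to the caller, so only the
  // remaining fetch_count objects are pushed onto the free list and
  // counted towards size_.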
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    // Pop the top of the list and add the rest to the freelist.
    void *second = start;
    start = FL_Pop(&second);
    list->PushRange(fetch_count, second, end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
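  // Move whatever is left (at most batch_size objects) as a final,
  // possibly partial, batch and account for all the released bytes.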
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

// static
unsigned int ThreadCache::GetBytesAllocatedOnCurrentThread() {
  return ThreadCache::GetThreadHeap()->GetTotalBytesAllocated();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = true;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
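  // Return this cache's budget to the shared pool so that other threads
  // can claim it later via IncreaseCacheLimitLocked().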
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

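  // If the per-thread budget shrank (ratio < 1.0), scale every existing
  // cache down proportionally; growth is handled lazily through the
  // slow-start path in IncreaseCacheLimitLocked() (see the comment below).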
  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc