// Copyright (c) 2008, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// ---
// Author: Ken Ashcraft <opensource (at) google.com>

#include <config.h>
#include "thread_cache.h"
#include <errno.h>
#include <string.h>                     // for memcpy
#include <algorithm>                    // for max, min
#include "base/commandlineflags.h"      // for DEFINE_int64, EnvToInt64
#include "base/spinlock.h"              // for SpinLockHolder
#include "central_freelist.h"           // for CentralFreeListPadded
#include "maybe_threads.h"

using std::min;
using std::max;

DEFINE_int64(tcmalloc_max_total_thread_cache_bytes,
             EnvToInt64("TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES",
                        kDefaultOverallThreadCacheSize),
             "Bound on the total number of bytes allocated to "
             "thread caches. This bound is not strict, so it is possible "
             "for the cache to go over this bound in certain circumstances. "
             "Maximum value of this flag is capped to 1 GB.");

namespace tcmalloc {

static bool phinited = false;

volatile size_t ThreadCache::per_thread_cache_size_ = kMaxThreadCacheSize;
size_t ThreadCache::overall_thread_cache_size_ = kDefaultOverallThreadCacheSize;
ssize_t ThreadCache::unclaimed_cache_space_ = kDefaultOverallThreadCacheSize;
PageHeapAllocator<ThreadCache> threadcache_allocator;
ThreadCache* ThreadCache::thread_heaps_ = NULL;
int ThreadCache::thread_heap_count_ = 0;
ThreadCache* ThreadCache::next_memory_steal_ = NULL;
#ifdef HAVE_TLS
__thread ThreadCache* ThreadCache::threadlocal_heap_
// See comments in thread_cache.h about this.
// Bug here:
// http://code.google.com/p/chromium/issues/detail?id=124489
#if defined(HAVE___ATTRIBUTE__) && !defined(PGO_GENERATE)
__attribute__ ((tls_model ("initial-exec")))
# endif
;
#endif
bool ThreadCache::tsd_inited_ = false;
pthread_key_t ThreadCache::heap_key_;

#if defined(HAVE_TLS)
bool kernel_supports_tls = false;      // be conservative
# if defined(_WIN32)    // windows has supported TLS since winnt, I think.
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = true;
}
# elif !HAVE_DECL_UNAME    // if too old for uname, probably too old for TLS
void CheckIfKernelSupportsTLS() {
  kernel_supports_tls = false;
}
# else
#   include <sys/utsname.h>    // DECL_UNAME checked for <sys/utsname.h> too
void CheckIfKernelSupportsTLS() {
  struct utsname buf;
  if (uname(&buf) < 0) {   // should be impossible
    Log(kLog, __FILE__, __LINE__,
        "uname failed; assuming no TLS support (errno)", errno);
    kernel_supports_tls = false;
  } else if (strcasecmp(buf.sysname, "linux") == 0) {
    // The linux case: the first kernel to support TLS was 2.6.0
    if (buf.release[0] < '2' && buf.release[1] == '.')    // 0.x or 1.x
      kernel_supports_tls = false;
    else if (buf.release[0] == '2' && buf.release[1] == '.' &&
             buf.release[2] >= '0' && buf.release[2] < '6' &&
             buf.release[3] == '.')                       // 2.0 - 2.5
      kernel_supports_tls = false;
    else
      kernel_supports_tls = true;
  } else if (strcasecmp(buf.sysname, "CYGWIN_NT-6.1-WOW64") == 0) {
    // In my testing, this version of cygwin, at least, would hang
    // when using TLS.
    kernel_supports_tls = false;
  } else {        // some other kernel, we'll be optimistic
    kernel_supports_tls = true;
  }
  // TODO(csilvers): VLOG(1) the tls status once we support RAW_VLOG
}
# endif  // HAVE_DECL_UNAME
#endif   // HAVE_TLS

void ThreadCache::Init(pthread_t tid) {
  size_ = 0;
  total_bytes_allocated_ = 0;

  max_size_ = 0;
  IncreaseCacheLimitLocked();
  if (max_size_ == 0) {
    // There isn't enough memory to go around.  Just give the minimum to
    // this thread.
    max_size_ = kMinThreadCacheSize;

    // Take unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kMinThreadCacheSize;
    ASSERT(unclaimed_cache_space_ < 0);
  }

  next_ = NULL;
  prev_ = NULL;
  tid_ = tid;
  in_setspecific_ = false;
  for (size_t cl = 0; cl < kNumClasses; ++cl) {
    list_[cl].Init();
  }

  uint32_t sampler_seed;
  memcpy(&sampler_seed, &tid, sizeof(sampler_seed));
  sampler_.Init(sampler_seed);
}

void ThreadCache::Cleanup() {
  // Put unused memory back into central cache
  for (int cl = 0; cl < kNumClasses; ++cl) {
    if (list_[cl].length() > 0) {
      ReleaseToCentralCache(&list_[cl], cl, list_[cl].length());
    }
  }
}

// Remove some objects of class "cl" from central cache and add to thread heap.
// On success, return the first object for immediate use; otherwise return NULL.
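// The number of objects fetched is capped both by the list's current
// max_length and by the per-class batch size, so a freshly created or
// mostly idle thread only pulls a few objects at a time.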
void* ThreadCache::FetchFromCentralCache(size_t cl, size_t byte_size) {
  FreeList* list = &list_[cl];
  ASSERT(list->empty());
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);

  const int num_to_move = min<int>(list->max_length(), batch_size);
  void *start, *end;
  int fetch_count = Static::central_cache()[cl].RemoveRange(
      &start, &end, num_to_move);

  ASSERT((start == NULL) == (fetch_count == 0));
  if (--fetch_count >= 0) {
    size_ += byte_size * fetch_count;
    // Pop the top of the list and add the rest to the freelist.
    void *second = start;
    start = FL_Pop(&second);
    list->PushRange(fetch_count, second, end);
  }

  // Increase max length slowly up to batch_size.  After that,
  // increase by batch_size in one shot so that the length is a
  // multiple of batch_size.
  if (list->max_length() < batch_size) {
    list->set_max_length(list->max_length() + 1);
  } else {
    // Don't let the list get too long.  In 32 bit builds, the length
    // is represented by a 16 bit int, so we need to watch out for
    // integer overflow.
    int new_length = min<int>(list->max_length() + batch_size,
                              kMaxDynamicFreeListLength);
    // The list's max_length must always be a multiple of batch_size,
    // and kMaxDynamicFreeListLength is not necessarily a multiple
    // of batch_size.
    new_length -= new_length % batch_size;
    ASSERT(new_length % batch_size == 0);
    list->set_max_length(new_length);
  }
  return start;
}

void ThreadCache::ListTooLong(FreeList* list, size_t cl) {
  const int batch_size = Static::sizemap()->num_objects_to_move(cl);
  ReleaseToCentralCache(list, cl, batch_size);

  // If the list is too long, we need to transfer some number of
  // objects to the central cache.  Ideally, we would transfer
  // num_objects_to_move, so the code below tries to make max_length
  // converge on num_objects_to_move.

  if (list->max_length() < batch_size) {
    // Slow start the max_length so we don't overreserve.
    list->set_max_length(list->max_length() + 1);
  } else if (list->max_length() > batch_size) {
    // If we consistently go over max_length, shrink max_length.  If we don't
    // shrink it, some amount of memory will always stay in this freelist.
    list->set_length_overages(list->length_overages() + 1);
    if (list->length_overages() > kMaxOverages) {
      ASSERT(list->max_length() > batch_size);
      list->set_max_length(list->max_length() - batch_size);
      list->set_length_overages(0);
    }
  }
}

// Remove some objects of class "cl" from thread heap and add to central cache
void ThreadCache::ReleaseToCentralCache(FreeList* src, size_t cl, int N) {
  ASSERT(src == &list_[cl]);
  if (N > src->length()) N = src->length();
  size_t delta_bytes = N * Static::sizemap()->ByteSizeForClass(cl);

  // We return prepackaged chains of the correct size to the central cache.
  // TODO: Use the same format internally in the thread caches?
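  // Illustrative example (hypothetical numbers): with batch_size == 32 and
  // N == 70, the loop below hands back two full batches of 32, and the final
  // InsertRange call returns the remaining 6 objects as one short chain.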
  int batch_size = Static::sizemap()->num_objects_to_move(cl);
  while (N > batch_size) {
    void *tail, *head;
    src->PopRange(batch_size, &head, &tail);
    Static::central_cache()[cl].InsertRange(head, tail, batch_size);
    N -= batch_size;
  }
  void *tail, *head;
  src->PopRange(N, &head, &tail);
  Static::central_cache()[cl].InsertRange(head, tail, N);
  size_ -= delta_bytes;
}

// Release idle memory to the central cache
void ThreadCache::Scavenge() {
  // If the low-water mark for the free list is L, it means we would
  // not have had to allocate anything from the central cache even if
  // we had reduced the free list size by L.  We aim to get closer to
  // that situation by dropping L/2 nodes from the free list.  This
  // may not release much memory, but if so we will call scavenge again
  // pretty soon and the low-water marks will be high on that call.
  //int64 start = CycleClock::Now();
  for (int cl = 0; cl < kNumClasses; cl++) {
    FreeList* list = &list_[cl];
    const int lowmark = list->lowwatermark();
    if (lowmark > 0) {
      const int drop = (lowmark > 1) ? lowmark/2 : 1;
      ReleaseToCentralCache(list, cl, drop);

      // Shrink the max length if it isn't used.  Only shrink down to
      // batch_size -- if the thread was active enough to get the max_length
      // above batch_size, it will likely be that active again.  If
      // max_length shrinks below batch_size, the thread will have to
      // go through the slow-start behavior again.  The slow-start is useful
      // mainly for threads that stay relatively idle for their entire
      // lifetime.
      const int batch_size = Static::sizemap()->num_objects_to_move(cl);
      if (list->max_length() > batch_size) {
        list->set_max_length(
            max<int>(list->max_length() - batch_size, batch_size));
      }
    }
    list->clear_lowwatermark();
  }

  IncreaseCacheLimit();
}

void ThreadCache::IncreaseCacheLimit() {
  SpinLockHolder h(Static::pageheap_lock());
  IncreaseCacheLimitLocked();
}

void ThreadCache::IncreaseCacheLimitLocked() {
  if (unclaimed_cache_space_ > 0) {
    // Possibly make unclaimed_cache_space_ negative.
    unclaimed_cache_space_ -= kStealAmount;
    max_size_ += kStealAmount;
    return;
  }
  // Don't hold pageheap_lock too long.  Try to steal from 10 other
  // threads before giving up.  The i < 10 condition also prevents an
  // infinite loop in case none of the existing thread heaps are
  // suitable places to steal from.
  for (int i = 0; i < 10;
       ++i, next_memory_steal_ = next_memory_steal_->next_) {
    // Reached the end of the linked list.  Start at the beginning.
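    // (next_memory_steal_ acts as a round-robin cursor over thread_heaps_,
    // so successive calls spread the stealing across different threads.)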
    if (next_memory_steal_ == NULL) {
      ASSERT(thread_heaps_ != NULL);
      next_memory_steal_ = thread_heaps_;
    }
    if (next_memory_steal_ == this ||
        next_memory_steal_->max_size_ <= kMinThreadCacheSize) {
      continue;
    }
    next_memory_steal_->max_size_ -= kStealAmount;
    max_size_ += kStealAmount;

    next_memory_steal_ = next_memory_steal_->next_;
    return;
  }
}

int ThreadCache::GetSamplePeriod() {
  return sampler_.GetSamplePeriod();
}

// static
unsigned int ThreadCache::GetBytesAllocatedOnCurrentThread() {
  return ThreadCache::GetThreadHeap()->GetTotalBytesAllocated();
}

void ThreadCache::InitModule() {
  SpinLockHolder h(Static::pageheap_lock());
  if (!phinited) {
    Static::InitStaticVars();
    threadcache_allocator.Init();
    phinited = 1;
  }
}

void ThreadCache::InitTSD() {
  ASSERT(!tsd_inited_);
  perftools_pthread_key_create(&heap_key_, DestroyThreadCache);
  tsd_inited_ = true;

#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
  // We may have used a fake pthread_t for the main thread.  Fix it.
  pthread_t zero;
  memset(&zero, 0, sizeof(zero));
  SpinLockHolder h(Static::pageheap_lock());
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    if (h->tid_ == zero) {
      h->tid_ = pthread_self();
    }
  }
#endif
}

ThreadCache* ThreadCache::CreateCacheIfNecessary() {
  // Initialize per-thread data if necessary
  ThreadCache* heap = NULL;
  {
    SpinLockHolder h(Static::pageheap_lock());
    // On some old glibc's, and on freebsd's libc (as of freebsd 8.1),
    // calling pthread routines (even pthread_self) too early could
    // cause a segfault.  Since we can call pthreads quite early, we
    // have to protect against that in such situations by making a
    // 'fake' pthread.  This is not ideal since it doesn't work well
    // when linking tcmalloc statically with apps that create threads
    // before main, so we only do it if we have to.
#ifdef PTHREADS_CRASHES_IF_RUN_TOO_EARLY
    pthread_t me;
    if (!tsd_inited_) {
      memset(&me, 0, sizeof(me));
    } else {
      me = pthread_self();
    }
#else
    const pthread_t me = pthread_self();
#endif

    // This may be a recursive malloc call from pthread_setspecific().
    // In that case, the heap for this thread has already been created
    // and added to the linked list.  So we search for that first.
    for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
      if (h->tid_ == me) {
        heap = h;
        break;
      }
    }

    if (heap == NULL) heap = NewHeap(me);
  }

  // We call pthread_setspecific() outside the lock because it may
  // call malloc() recursively.  We check for the recursive call using
  // the "in_setspecific_" flag so that we can avoid calling
  // pthread_setspecific() if we are already inside pthread_setspecific().
  if (!heap->in_setspecific_ && tsd_inited_) {
    heap->in_setspecific_ = true;
    perftools_pthread_setspecific(heap_key_, heap);
#ifdef HAVE_TLS
    // Also keep a copy in __thread for faster retrieval
    threadlocal_heap_ = heap;
#endif
    heap->in_setspecific_ = false;
  }
  return heap;
}

ThreadCache* ThreadCache::NewHeap(pthread_t tid) {
  // Create the heap and add it to the linked list
  ThreadCache *heap = threadcache_allocator.New();
  heap->Init(tid);
  heap->next_ = thread_heaps_;
  heap->prev_ = NULL;
  if (thread_heaps_ != NULL) {
    thread_heaps_->prev_ = heap;
  } else {
    // This is the only thread heap at the moment.
    ASSERT(next_memory_steal_ == NULL);
    next_memory_steal_ = heap;
  }
  thread_heaps_ = heap;
  thread_heap_count_++;
  return heap;
}

void ThreadCache::BecomeIdle() {
  if (!tsd_inited_) return;              // No caches yet
  ThreadCache* heap = GetThreadHeap();
  if (heap == NULL) return;             // No thread cache to remove
  if (heap->in_setspecific_) return;    // Do not disturb the active caller

  heap->in_setspecific_ = true;
  perftools_pthread_setspecific(heap_key_, NULL);
#ifdef HAVE_TLS
  // Also update the copy in __thread
  threadlocal_heap_ = NULL;
#endif
  heap->in_setspecific_ = false;
  if (GetThreadHeap() == heap) {
    // Somehow heap got reinstated by a recursive call to malloc
    // from pthread_setspecific.  We give up in this case.
    return;
  }

  // We can now get rid of the heap
  DeleteCache(heap);
}

void ThreadCache::DestroyThreadCache(void* ptr) {
  // Note that "ptr" cannot be NULL since pthread promises not
  // to invoke the destructor on NULL values, but for safety,
  // we check anyway.
  if (ptr == NULL) return;
#ifdef HAVE_TLS
  // Prevent fast path of GetThreadHeap() from returning heap.
  threadlocal_heap_ = NULL;
#endif
  DeleteCache(reinterpret_cast<ThreadCache*>(ptr));
}

void ThreadCache::DeleteCache(ThreadCache* heap) {
  // Remove all memory from heap
  heap->Cleanup();

  // Remove from linked list
  SpinLockHolder h(Static::pageheap_lock());
  if (heap->next_ != NULL) heap->next_->prev_ = heap->prev_;
  if (heap->prev_ != NULL) heap->prev_->next_ = heap->next_;
  if (thread_heaps_ == heap) thread_heaps_ = heap->next_;
  thread_heap_count_--;

  if (next_memory_steal_ == heap) next_memory_steal_ = heap->next_;
  if (next_memory_steal_ == NULL) next_memory_steal_ = thread_heaps_;
  unclaimed_cache_space_ += heap->max_size_;

  threadcache_allocator.Delete(heap);
}

void ThreadCache::RecomputePerThreadCacheSize() {
  // Divide available space across threads
  int n = thread_heap_count_ > 0 ? thread_heap_count_ : 1;
  size_t space = overall_thread_cache_size_ / n;

  // Limit to allowed range
  if (space < kMinThreadCacheSize) space = kMinThreadCacheSize;
  if (space > kMaxThreadCacheSize) space = kMaxThreadCacheSize;

  double ratio = space / max<double>(1, per_thread_cache_size_);
  size_t claimed = 0;
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    // Increasing the total cache size should not circumvent the
    // slow-start growth of max_size_.
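    // For example (hypothetical numbers), if the overall size is doubled the
    // ratio is 2.0 and existing max_size_ values are left untouched; they can
    // grow later via IncreaseCacheLimit().  Only when the overall size shrinks
    // (ratio < 1.0) are the per-thread limits scaled down proportionally.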
    if (ratio < 1.0) {
      h->max_size_ = static_cast<size_t>(h->max_size_ * ratio);
    }
    claimed += h->max_size_;
  }
  unclaimed_cache_space_ = overall_thread_cache_size_ - claimed;
  per_thread_cache_size_ = space;
}

void ThreadCache::GetThreadStats(uint64_t* total_bytes, uint64_t* class_count) {
  for (ThreadCache* h = thread_heaps_; h != NULL; h = h->next_) {
    *total_bytes += h->Size();
    if (class_count) {
      for (int cl = 0; cl < kNumClasses; ++cl) {
        class_count[cl] += h->freelist_length(cl);
      }
    }
  }
}

void ThreadCache::set_overall_thread_cache_size(size_t new_size) {
  // Clip the value to a reasonable range
  if (new_size < kMinThreadCacheSize) new_size = kMinThreadCacheSize;
  if (new_size > (1<<30)) new_size = (1<<30);     // Limit to 1GB
  overall_thread_cache_size_ = new_size;

  RecomputePerThreadCacheSize();
}

}  // namespace tcmalloc