// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This is an OS-independent* module whose purpose is tracking allocations and
// their call sites (stack traces). It is able to deal with hole punching
// (read: munmap). Also, it has low overhead and its presence in the system is
// barely noticeable, even when tracing *all* the processes.
// This module does NOT know how to deal with stack unwinding. The caller must
// do that and pass the addresses of the unwound stack.
// * (Modulo three lines for mutexes.)
//
// Exposed API:
//   void heap_profiler_init(HeapStats*);
//   void heap_profiler_alloc(addr, size, stack_frames, depth, flags);
//   void heap_profiler_free(addr, size, old_flags);
//     (size == 0 means free the entire region.)
//
// The profiling information is tracked into two data structures:
// 1) An RB-Tree of non-overlapping VM regions (allocs) sorted by their start
//    addr. Each entry tracks the start and end addresses and points to the
//    stack trace which created that allocation (see below).
// 2) A (hash) table of stack traces. In general #allocations >> #call sites
//    which create those allocations. In order to avoid duplicating the latter,
//    they are stored only once in this hash table and referenced by the
//    allocs.
//
//   / Process virtual address space \
//   +------+     +------+     +------+
//   |Alloc1|     |Alloc2|     |Alloc3|  <- Allocs (an RB-Tree underneath)
//   +------+     +------+     +------+
//   Len: 12      Len: 4       Len: 4
//      |            |            |          stack_traces
//      |            |            |   +-----------+--------------+
//      |            |            |   | Alloc tot | stack frames |
//      |            |            |   +-----------+--------------+
//      +------------|------------+-->|    16     | 0x1234 ....  |
//                   |                +-----------+--------------+
//                   +--------------->|     4     | 0x5678 ....  |
//                                    +-----------+--------------+
//                                      (A hash-table underneath)
//
// Final note: the memory for both 1) and 2) entries is carved out of two
// static pools (i.e. stack_traces and allocs). Each pool essentially behaves
// like a sbrk and is kept compact by reusing freed elements (hence the
// freelist kept for each of them).
//
// All the internal (static) functions here assume that the |lock| is held.
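//
// Illustrative usage (a minimal sketch, not taken from real call sites; the
// names and values below are made up, and the caller is responsible for the
// stack unwinding as noted above):
//
//   static HeapStats stats_storage;              // Owned by the embedder.
//   heap_profiler_init(&stats_storage);
//   uintptr_t frames[2] = {0x1234, 0x5678};      // Already-unwound stack.
//   void* ptr = ...;                             // Intercepted allocation.
//   heap_profiler_alloc(ptr, 4096, frames, 2, 0 /* flags */);
//   heap_profiler_free(ptr, 0, NULL);            // 0 == free the whole region.
//   heap_profiler_cleanup();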

#include <assert.h>
#include <string.h>

// Platform-dependent mutex boilerplate.
#if defined(__linux__) || defined(__ANDROID__)
#include <pthread.h>
#define DEFINE_MUTEX(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER
#define LOCK_MUTEX(x) pthread_mutex_lock(&x)
#define UNLOCK_MUTEX(x) pthread_mutex_unlock(&x)
#else
#error OS not supported.
#endif

#include "tools/android/heap_profiler/heap_profiler.h"


static DEFINE_MUTEX(lock);

// |stats| contains the global tracking metadata and is the entry point which
// is read by the heap_dump tool.
static HeapStats* stats;

// +---------------------------------------------------------------------------+
// + Stack traces hash-table                                                    +
// +---------------------------------------------------------------------------+
#define ST_ENTRIES_MAX (64 * 1024)
#define ST_HASHTABLE_BUCKETS (64 * 1024) /* Must be a power of 2. */

static StacktraceEntry stack_traces[ST_ENTRIES_MAX];
static StacktraceEntry* stack_traces_freelist;
static StacktraceEntry* stack_traces_ht[ST_HASHTABLE_BUCKETS];

// Looks up a stack trace from the stack frames. Creates a new one if necessary.
static StacktraceEntry* record_stacktrace(uintptr_t* frames, uint32_t depth) {
  if (depth == 0)
    return NULL;

  if (depth > HEAP_PROFILER_MAX_DEPTH)
    depth = HEAP_PROFILER_MAX_DEPTH;

  uint32_t i;
  uintptr_t hash = 0;
  for (i = 0; i < depth; ++i)
    hash = (hash << 1) ^ (frames[i]);
  const uint32_t slot = hash & (ST_HASHTABLE_BUCKETS - 1);
  StacktraceEntry* st = stack_traces_ht[slot];

  // Look for an existing entry in the hash-table. An entry matches only if
  // both its hash and its frames match (the memcmp guards against hash
  // collisions within the bucket).
  const size_t frames_length = depth * sizeof(uintptr_t);
  while (st != NULL &&
         (st->hash != hash || memcmp(frames, st->frames, frames_length) != 0)) {
    st = st->next;
  }

  // If not found, create a new one from the stack_traces array and add it to
  // the hash-table.
  if (st == NULL) {
    // Get a free element either from the freelist or from the pool.
    if (stack_traces_freelist != NULL) {
      st = stack_traces_freelist;
      stack_traces_freelist = stack_traces_freelist->next;
    } else if (stats->max_stack_traces < ST_ENTRIES_MAX) {
      st = &stack_traces[stats->max_stack_traces];
      ++stats->max_stack_traces;
    } else {
      return NULL;
    }

    memset(st, 0, sizeof(*st));
    memcpy(st->frames, frames, frames_length);
    st->hash = hash;
    st->next = stack_traces_ht[slot];
    stack_traces_ht[slot] = st;
    ++stats->num_stack_traces;
  }

  return st;
}
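
// For illustration (made-up numbers): if 1000 live allocations originate from
// the same unwound call site, record_stacktrace() hands back the same
// StacktraceEntry for all of them; its |alloc_bytes| ends up accumulating the
// sizes of all those allocations, while the Allocs tree below still tracks
// 1000 distinct address ranges.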

// Frees up a stack trace and appends it to the corresponding freelist.
static void free_stacktrace(StacktraceEntry* st) {
  assert(st->alloc_bytes == 0);
  const uint32_t slot = st->hash & (ST_HASHTABLE_BUCKETS - 1);

  // The expected load factor of the hash-table is very low. Frees should be
  // pretty rare. Hence don't bother with a doubly linked list, it might cost
  // more.
  StacktraceEntry** prev = &stack_traces_ht[slot];
  while (*prev != st)
    prev = &((*prev)->next);

  // Remove from the hash-table bucket.
  assert(*prev == st);
  *prev = st->next;

  // Add to the freelist.
  st->next = stack_traces_freelist;
  stack_traces_freelist = st;
  --stats->num_stack_traces;
}

// +---------------------------------------------------------------------------+
// + Allocs RB-tree                                                             +
// +---------------------------------------------------------------------------+
#define ALLOCS_ENTRIES_MAX (256 * 1024)

static Alloc allocs[ALLOCS_ENTRIES_MAX];
static Alloc* allocs_freelist;
static RB_HEAD(HeapEntriesTree, Alloc) allocs_tree =
    RB_INITIALIZER(&allocs_tree);

// Comparator used by the RB-Tree (mind overflow: avoid arithmetic on the
// addresses).
static int allocs_tree_cmp(Alloc* alloc_1, Alloc* alloc_2) {
  if (alloc_1->start < alloc_2->start)
    return -1;
  if (alloc_1->start > alloc_2->start)
    return 1;
  return 0;
}

RB_PROTOTYPE(HeapEntriesTree, Alloc, rb_node, allocs_tree_cmp);
RB_GENERATE(HeapEntriesTree, Alloc, rb_node, allocs_tree_cmp);

// Allocates a new Alloc and inserts it in the tree.
static Alloc* insert_alloc(
    uintptr_t start, uintptr_t end, StacktraceEntry* st, uint32_t flags) {
  Alloc* alloc = NULL;

  // First of all, get a free element either from the freelist or from the
  // pool.
  if (allocs_freelist != NULL) {
    alloc = allocs_freelist;
    allocs_freelist = alloc->next_free;
  } else if (stats->max_allocs < ALLOCS_ENTRIES_MAX) {
    alloc = &allocs[stats->max_allocs];
    ++stats->max_allocs;
  } else {
    return NULL;  // OOM.
  }

  alloc->start = start;
  alloc->end = end;
  alloc->st = st;
  alloc->flags = flags;
  alloc->next_free = NULL;
  RB_INSERT(HeapEntriesTree, &allocs_tree, alloc);
  ++stats->num_allocs;
  return alloc;
}

// Deletes all the allocs in the range [addr, addr+size[ dealing with partial
// frees and hole punching. Note that in the general case this function might
// need to deal with very unfortunate cases, as below:
//
// Alloc tree begin: [Alloc 1]----[Alloc 2]-------[Alloc 3][Alloc 4]---[Alloc 5]
// Deletion range:                      [xxxxxxxxxxxxxxxxxxxx]
// Alloc tree end:   [Alloc 1]----[Al.2]----------------------[Al.4]---[Alloc 5]
//                   (Alloc 3 has to be deleted; Alloc 2 and 4 shrunk.)
static uint32_t delete_allocs_in_range(void* addr, size_t size) {
  uintptr_t del_start = (uintptr_t) addr;
  uintptr_t del_end = del_start + size - 1;
  uint32_t flags = 0;

  Alloc* alloc = NULL;
  Alloc* next_alloc = RB_ROOT(&allocs_tree);

  // Lookup the first (by address) relevant Alloc to initiate the deletion
  // walk. At the end of the loop next_alloc is either:
  // - the closest alloc starting before (or exactly at) the start of the
  //   deletion range (i.e. addr == del_start).
  // - the first alloc inside the deletion range.
  // - the first alloc after the deletion range, iff the range was already
  //   empty (in this case the next loop will just bail out doing nothing).
  // - NULL, iff the entire tree is empty (as above).
  while (next_alloc != NULL) {
    alloc = next_alloc;
    if (alloc->start > del_start) {
      next_alloc = RB_LEFT(alloc, rb_node);
    } else if (alloc->end < del_start) {
      next_alloc = RB_RIGHT(alloc, rb_node);
    } else {  // alloc->start <= del_start && alloc->end >= del_start
      break;
    }
  }

  // Now scan the allocs linearly, deleting chunks (or possibly whole allocs)
  // until we pass the end of the deletion range.
  next_alloc = alloc;
  while (next_alloc != NULL) {
    alloc = next_alloc;
    next_alloc = RB_NEXT(HeapEntriesTree, &allocs_tree, alloc);

    if (size != 0) {
      // In the general case we stop once past the end of the deletion range.
      if (alloc->start > del_end)
        break;

      // This deals with the case of the first Alloc lying before the range.
      if (alloc->end < del_start)
        continue;
    } else {
      // size == 0 is a special case. It means deleting only the alloc which
      // starts exactly at |del_start|, if any (to deal with free(ptr)).
      if (alloc->start > del_start)
        break;
      if (alloc->start < del_start)
        continue;
      del_end = alloc->end;
    }

    // At this point the Alloc must overlap (partially or completely) with the
    // deletion range.
    assert(!(alloc->start > del_end || alloc->end < del_start));

    StacktraceEntry* st = alloc->st;
    flags |= alloc->flags;
    uintptr_t freed_bytes = 0;  // Bytes freed in this cycle.

    if (del_start <= alloc->start) {
      if (del_end >= alloc->end) {
        // Complete overlap. Delete the full Alloc. Note: the range might
        // still overlap with the next allocs.
        // Begin:      ------[alloc.start   alloc.end]---[next alloc]
        // Del range:     [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
        // Result:     -------------------------------------[next alloc]
        //             [next alloc] will be shrunk on the next iteration.
        freed_bytes = alloc->end - alloc->start + 1;
        RB_REMOVE(HeapEntriesTree, &allocs_tree, alloc);

        // Clean up, so heap_dump can tell this is a free entry and skip it.
        alloc->start = alloc->end = 0;
        alloc->st = NULL;

        // Put it in the freelist.
        alloc->next_free = allocs_freelist;
        allocs_freelist = alloc;
        --stats->num_allocs;
      } else {
        // Partial overlap at the beginning. Cut the first part and shrink the
        // alloc.
        // Begin:      ------[alloc.start   alloc.end]---[next alloc]
        // Del range:     [xxxxxxxx]
        // Result:     ------------[start   alloc.end]---[next alloc]
        freed_bytes = del_end - alloc->start + 1;
        alloc->start = del_end + 1;
        // No need to update the tree even if we changed the key. The keys are
        // still monotonic (because the ranges are guaranteed not to overlap).
      }
    } else {
      if (del_end >= alloc->end) {
        // Partial overlap at the end. Cut the last part and shrink the alloc.
        // Begin:      ------[alloc.start   alloc.end]---[next alloc]
        // Del range:                        [xxxxxxxxxxxxx]
        // Result:     ------[alloc.start end]------------[next alloc]
        //             [next alloc] will be shrunk on the next iteration.
        freed_bytes = alloc->end - del_start + 1;
        alloc->end = del_start - 1;
      } else {
        // Hole punching. Requires creating an extra alloc.
        // Begin:      ------[alloc.start   alloc.end]---[next alloc]
        // Del range:               [xxxxx]
        // Result:     ------[ alloc 1 ]---[ alloc 2 ]---[next alloc]
        freed_bytes = del_end - del_start + 1;
        const uintptr_t old_end = alloc->end;
        alloc->end = del_start - 1;

        // In case of OOM, don't count the 2nd alloc we failed to allocate.
        if (insert_alloc(del_end + 1, old_end, st, alloc->flags) == NULL)
          freed_bytes += (old_end - del_end);
      }
    }
    // Now update the StacktraceEntry the Alloc was pointing to, possibly
    // freeing it up.
    assert(st->alloc_bytes >= freed_bytes);
    st->alloc_bytes -= freed_bytes;
    if (st->alloc_bytes == 0)
      free_stacktrace(st);
    stats->total_alloc_bytes -= freed_bytes;
  }
  return flags;
}
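
// Worked example of the hole-punching case above (illustrative, with made-up
// addresses): starting from a single Alloc covering [0x1000, 0x1FFF], a call
// to delete_allocs_in_range((void*) 0x1400, 0x400) leaves two Allocs,
// [0x1000, 0x13FF] and [0x1800, 0x1FFF], both pointing to the original
// StacktraceEntry, whose alloc_bytes (and stats->total_alloc_bytes) drops by
// 0x400.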

// +---------------------------------------------------------------------------+
// + Library entry points (refer to heap_profiler.h for API doc).              +
// +---------------------------------------------------------------------------+
void heap_profiler_free(void* addr, size_t size, uint32_t* old_flags) {
  assert(size == 0 || ((uintptr_t) addr + (size - 1)) >= (uintptr_t) addr);

  LOCK_MUTEX(lock);
  uint32_t flags = delete_allocs_in_range(addr, size);
  UNLOCK_MUTEX(lock);

  if (old_flags != NULL)
    *old_flags = flags;
}

void heap_profiler_alloc(void* addr, size_t size, uintptr_t* frames,
                         uint32_t depth, uint32_t flags) {
  if (depth > HEAP_PROFILER_MAX_DEPTH)
    depth = HEAP_PROFILER_MAX_DEPTH;

  if (size == 0)  // Apps sometimes call malloc(0); just ignore it.
    return;

  const uintptr_t start = (uintptr_t) addr;
  const uintptr_t end = start + (size - 1);
  assert(start <= end);

  LOCK_MUTEX(lock);

  delete_allocs_in_range(addr, size);

  StacktraceEntry* st = record_stacktrace(frames, depth);
  if (st != NULL) {
    Alloc* alloc = insert_alloc(start, end, st, flags);
    if (alloc != NULL) {
      st->alloc_bytes += size;
      stats->total_alloc_bytes += size;
    }
  }

  UNLOCK_MUTEX(lock);
}

void heap_profiler_init(HeapStats* heap_stats) {
  LOCK_MUTEX(lock);

  assert(stats == NULL);
  stats = heap_stats;
  memset(stats, 0, sizeof(HeapStats));
  stats->magic_start = HEAP_PROFILER_MAGIC_MARKER;
  stats->allocs = &allocs[0];
  stats->stack_traces = &stack_traces[0];

  UNLOCK_MUTEX(lock);
}

void heap_profiler_cleanup(void) {
  LOCK_MUTEX(lock);

  assert(stats != NULL);
  memset(stack_traces, 0, sizeof(StacktraceEntry) * stats->max_stack_traces);
  memset(stack_traces_ht, 0, sizeof(stack_traces_ht));
  stack_traces_freelist = NULL;

  memset(allocs, 0, sizeof(Alloc) * stats->max_allocs);
  allocs_freelist = NULL;
  RB_INIT(&allocs_tree);

  stats = NULL;

  UNLOCK_MUTEX(lock);
}