/******************************************************************************/
#ifdef JEMALLOC_H_TYPES

typedef struct prof_bt_s prof_bt_t;
typedef struct prof_cnt_s prof_cnt_t;
typedef struct prof_thr_cnt_s prof_thr_cnt_t;
typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;

/* Option defaults. */
#ifdef JEMALLOC_PROF
#  define PROF_PREFIX_DEFAULT		"jeprof"
#else
#  define PROF_PREFIX_DEFAULT		""
#endif
#define	LG_PROF_SAMPLE_DEFAULT		19
#define	LG_PROF_INTERVAL_DEFAULT	-1

/*
 * Hard limit on stack backtrace depth.  The version of prof_backtrace() that
 * is based on __builtin_return_address() necessarily has a hard-coded number
 * of backtrace frame handlers, and should be kept in sync with this setting.
 */
#define	PROF_BT_MAX			128

/* Maximum number of backtraces to store in each per thread LRU cache. */
#define	PROF_TCMAX			1024

/* Initial hash table size. */
#define	PROF_CKH_MINITEMS		64

/* Size of memory buffer to use when writing dump files. */
#define	PROF_DUMP_BUFSIZE		65536

/* Size of stack-allocated buffer used by prof_printf(). */
#define	PROF_PRINTF_BUFSIZE		128

/*
 * Number of mutexes shared among all ctx's.  No space is allocated for these
 * unless profiling is enabled, so it's okay to over-provision.
 */
#define	PROF_NCTX_LOCKS			1024

/*
 * prof_tdata pointers close to NULL are used to encode state information that
 * is used for cleaning up during thread shutdown.
 */
#define	PROF_TDATA_STATE_REINCARNATED	((prof_tdata_t *)(uintptr_t)1)
#define	PROF_TDATA_STATE_PURGATORY	((prof_tdata_t *)(uintptr_t)2)
#define	PROF_TDATA_STATE_MAX		PROF_TDATA_STATE_PURGATORY

#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS

struct prof_bt_s {
	/* Backtrace, stored as len program counters. */
	void		**vec;
	unsigned	len;
};

#ifdef JEMALLOC_PROF_LIBGCC
/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
typedef struct {
	prof_bt_t	*bt;
	unsigned	max;
} prof_unwind_data_t;
#endif

struct prof_cnt_s {
	/*
	 * Profiling counters.  An allocation/deallocation pair can operate on
	 * different prof_thr_cnt_t objects that are linked into the same
	 * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
	 * negative.  In principle it is possible for the *bytes counters to
	 * overflow/underflow, but a general solution would require something
	 * like 128-bit counters; this implementation doesn't bother to solve
	 * that problem.
	 */
	int64_t		curobjs;
	int64_t		curbytes;
	uint64_t	accumobjs;
	uint64_t	accumbytes;
};

struct prof_thr_cnt_s {
	/* Linkage into prof_ctx_t's cnts_ql. */
	ql_elm(prof_thr_cnt_t)	cnts_link;

	/* Linkage into thread's LRU. */
	ql_elm(prof_thr_cnt_t)	lru_link;

	/*
	 * Associated context.  If a thread frees an object that it did not
	 * allocate, it is possible that the context is not cached in the
	 * thread's hash table, in which case it must be able to look up the
	 * context, insert a new prof_thr_cnt_t into the thread's hash table,
	 * and link it into the prof_ctx_t's cnts_ql.
	 */
	prof_ctx_t		*ctx;

	/*
	 * Threads use memory barriers to update the counters.  Since there is
	 * only ever one writer, the only challenge is for the reader to get a
	 * consistent read of the counters.
	 *
	 * The writer uses this series of operations:
	 *
	 * 1) Increment epoch to an odd number.
	 * 2) Update counters.
	 * 3) Increment epoch to an even number.
	 *
	 * The reader must assure 1) that the epoch is even while it reads the
	 * counters, and 2) that the epoch doesn't change between the time it
	 * starts and finishes reading the counters.
	 */
	unsigned		epoch;

	/* Profiling counters. */
	prof_cnt_t		cnts;
};
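
#if 0
/*
 * Illustrative sketch only (never compiled): one way a reader can take a
 * consistent snapshot of a prof_thr_cnt_t's counters under the epoch protocol
 * described above.  The function name is hypothetical; the actual reader
 * logic lives in prof.c.
 */
static void
prof_thr_cnt_snapshot(prof_thr_cnt_t *cnt, prof_cnt_t *snap)
{
	volatile unsigned *epoch = &cnt->epoch;

	while (true) {
		unsigned epoch0 = *epoch;

		/* Spin while the writer is mid-update (odd epoch). */
		if (epoch0 & 1U)
			continue;

		memcpy(snap, &cnt->cnts, sizeof(prof_cnt_t));

		/* The snapshot is consistent only if the epoch is unchanged. */
		if (*epoch == epoch0)
			break;
	}
}
#endif
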
struct prof_ctx_s {
	/* Associated backtrace. */
	prof_bt_t		*bt;

	/* Protects nlimbo, cnt_merged, and cnts_ql. */
	malloc_mutex_t		*lock;

	/*
	 * Number of threads that currently cause this ctx to be in a state of
	 * limbo due to one of:
	 *   - Initializing per thread counters associated with this ctx.
	 *   - Preparing to destroy this ctx.
	 *   - Dumping a heap profile that includes this ctx.
	 * nlimbo must be 1 (single destroyer) in order to safely destroy the
	 * ctx.
	 */
	unsigned		nlimbo;

	/* Temporary storage for summation during dump. */
	prof_cnt_t		cnt_summed;

	/* When threads exit, they merge their stats into cnt_merged. */
	prof_cnt_t		cnt_merged;

	/*
	 * List of profile counters, one for each thread that has allocated in
	 * this context.
	 */
	ql_head(prof_thr_cnt_t)	cnts_ql;

	/* Linkage for list of contexts to be dumped. */
	ql_elm(prof_ctx_t)	dump_link;
};
typedef ql_head(prof_ctx_t) prof_ctx_list_t;

struct prof_tdata_s {
	/*
	 * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *).  Each thread keeps a
	 * cache of backtraces, with associated thread-specific prof_thr_cnt_t
	 * objects.  Other threads may read the prof_thr_cnt_t contents, but no
	 * others will ever write them.
	 *
	 * Upon thread exit, the thread must merge all the prof_thr_cnt_t
	 * counter data into the associated prof_ctx_t objects, and unlink/free
	 * the prof_thr_cnt_t objects.
	 */
	ckh_t			bt2cnt;

	/* LRU for contents of bt2cnt. */
	ql_head(prof_thr_cnt_t)	lru_ql;

	/* Backtrace vector, used for calls to prof_backtrace(). */
	void			**vec;

	/* Sampling state. */
	uint64_t		prng_state;
	uint64_t		bytes_until_sample;

	/* State used to avoid dumping while operating on prof internals. */
	bool			enq;
	bool			enq_idump;
	bool			enq_gdump;
};

#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS

extern bool	opt_prof;
/*
 * Even if opt_prof is true, sampling can be temporarily disabled by setting
 * opt_prof_active to false.  No locking is used when updating opt_prof_active,
 * so there are no guarantees regarding how long it will take for all threads
 * to notice state changes.
 */
extern bool	opt_prof_active;
extern size_t	opt_lg_prof_sample;	/* Mean bytes between samples. */
extern ssize_t	opt_lg_prof_interval;	/* lg(prof_interval). */
extern bool	opt_prof_gdump;		/* High-water memory dumping. */
extern bool	opt_prof_final;		/* Final profile dumping. */
extern bool	opt_prof_leak;		/* Dump leak summary at exit. */
extern bool	opt_prof_accum;		/* Report cumulative bytes. */
extern char	opt_prof_prefix[
    /* Minimize memory bloat for non-prof builds. */
#ifdef JEMALLOC_PROF
    PATH_MAX +
#endif
    1];
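
#if 0
/*
 * Illustrative sketch only (never compiled): in practice, opt_prof_active is
 * flipped and manual dumps are requested through the public mallctl()
 * interface rather than by writing these variables directly.  This assumes
 * the public mallctl() entry point and the "prof.active"/"prof.dump" control
 * names; the example_*() wrapper is hypothetical.
 */
static void
example_pause_profiling_and_dump(const char *filename)
{
	bool active = false;

	/* Temporarily disable sampling; other threads notice eventually. */
	mallctl("prof.active", NULL, NULL, &active, sizeof(active));

	/* Write a heap profile; a NULL filename uses the default pattern. */
	mallctl("prof.dump", NULL, NULL, &filename, sizeof(filename));

	/* Re-enable sampling. */
	active = true;
	mallctl("prof.active", NULL, NULL, &active, sizeof(active));
}
#endif
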
/*
 * Profile dump interval, measured in bytes allocated.  Each arena triggers a
 * profile dump when it reaches this threshold.  The effect is that the
 * interval between profile dumps averages prof_interval, though the actual
 * interval between dumps will tend to be sporadic, and the interval will be a
 * maximum of approximately (prof_interval * narenas).
 */
extern uint64_t	prof_interval;

void	bt_init(prof_bt_t *bt, void **vec);
void	prof_backtrace(prof_bt_t *bt);
prof_thr_cnt_t	*prof_lookup(prof_bt_t *bt);
#ifdef JEMALLOC_JET
size_t	prof_bt_count(void);
typedef int (prof_dump_open_t)(bool, const char *);
extern prof_dump_open_t *prof_dump_open;
#endif
void	prof_idump(void);
bool	prof_mdump(const char *filename);
void	prof_gdump(void);
prof_tdata_t	*prof_tdata_init(void);
void	prof_tdata_cleanup(void *arg);
void	prof_boot0(void);
void	prof_boot1(void);
bool	prof_boot2(void);
void	prof_prefork(void);
void	prof_postfork_parent(void);
void	prof_postfork_child(void);
void	prof_sample_threshold_update(prof_tdata_t *prof_tdata);

#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
#ifdef JEMALLOC_H_INLINES

#define	PROF_ALLOC_PREP(size, ret) do {					\
	prof_tdata_t *prof_tdata;					\
	prof_bt_t bt;							\
									\
	assert(size == s2u(size));					\
									\
	if (!opt_prof_active ||						\
	    prof_sample_accum_update(size, false, &prof_tdata)) {	\
		ret = (prof_thr_cnt_t *)(uintptr_t)1U;			\
	} else {							\
		bt_init(&bt, prof_tdata->vec);				\
		prof_backtrace(&bt);					\
		ret = prof_lookup(&bt);					\
	}								\
} while (0)
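
#if 0
/*
 * Illustrative sketch only (never compiled): how a hypothetical allocation
 * path might combine PROF_ALLOC_PREP() with prof_malloc().  A cnt of
 * (prof_thr_cnt_t *)1U means "allocate but do not sample"; NULL means
 * backtrace lookup failed and is treated as OOM.  example_malloc_prof() and
 * example_alloc() are hypothetical names, not jemalloc functions, and usize
 * is assumed to already be a usable size (size == s2u(size)).
 */
static void *
example_malloc_prof(size_t usize)
{
	void *p;
	prof_thr_cnt_t *cnt;

	PROF_ALLOC_PREP(usize, cnt);
	if (cnt == NULL)
		return (NULL);
	p = example_alloc(usize);	/* Underlying allocation (hypothetical). */
	if (p == NULL)
		return (NULL);
	prof_malloc(p, usize, cnt);
	return (p);
}
#endif
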
#ifndef JEMALLOC_ENABLE_INLINE
malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)

prof_tdata_t	*prof_tdata_get(bool create);
bool	prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out);
prof_ctx_t	*prof_ctx_get(const void *ptr);
void	prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
void	prof_malloc_record_object(const void *ptr, size_t usize,
    prof_thr_cnt_t *cnt);
void	prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
void	prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx);
void	prof_free(const void *ptr, size_t size);
#endif

#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
malloc_tsd_externs(prof_tdata, prof_tdata_t *)
malloc_tsd_funcs(JEMALLOC_INLINE, prof_tdata, prof_tdata_t *, NULL,
    prof_tdata_cleanup)

JEMALLOC_INLINE prof_tdata_t *
prof_tdata_get(bool create)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = *prof_tdata_tsd_get();
	if (create && prof_tdata == NULL)
		prof_tdata = prof_tdata_init();

	return (prof_tdata);
}

JEMALLOC_INLINE prof_ctx_t *
prof_ctx_get(const void *ptr)
{
	prof_ctx_t *ret;
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		ret = arena_prof_ctx_get(ptr);
	} else
		ret = huge_prof_ctx_get(ptr);

	return (ret);
}

JEMALLOC_INLINE void
prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
{
	arena_chunk_t *chunk;

	cassert(config_prof);
	assert(ptr != NULL);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	if (chunk != ptr) {
		/* Region. */
		arena_prof_ctx_set(ptr, ctx);
	} else
		huge_prof_ctx_set(ptr, ctx);
}

JEMALLOC_INLINE bool
prof_sample_accum_update(size_t size, bool commit,
    prof_tdata_t **prof_tdata_out)
{
	prof_tdata_t *prof_tdata;

	cassert(config_prof);

	prof_tdata = prof_tdata_get(true);
	if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
		prof_tdata = NULL;

	if (prof_tdata_out != NULL)
		*prof_tdata_out = prof_tdata;

	if (prof_tdata == NULL)
		return (true);

	if (prof_tdata->bytes_until_sample >= size) {
		if (commit)
			prof_tdata->bytes_until_sample -= size;
		return (true);
	} else {
		/* Compute new sample threshold. */
		if (commit)
			prof_sample_threshold_update(prof_tdata);
		return (false);
	}
}

JEMALLOC_INLINE void
prof_malloc_record_object(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{
	prof_ctx_set(ptr, cnt->ctx);

	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
	cnt->cnts.curobjs++;
	cnt->cnts.curbytes += usize;
	if (opt_prof_accum) {
		cnt->cnts.accumobjs++;
		cnt->cnts.accumbytes += usize;
	}
	/*********/
	mb_write();
	/*********/
	cnt->epoch++;
	/*********/
	mb_write();
	/*********/
}

JEMALLOC_INLINE void
prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{

	cassert(config_prof);
	assert(ptr != NULL);
	assert(usize == isalloc(ptr, true));

	if (prof_sample_accum_update(usize, true, NULL)) {
		/*
		 * Don't sample.  For malloc()-like allocation, it is always
		 * possible to tell in advance how large an object's usable
		 * size will be, so there should never be a difference between
		 * the usize passed to PROF_ALLOC_PREP() and prof_malloc().
		 */
		assert((uintptr_t)cnt == (uintptr_t)1U);
	}

	if ((uintptr_t)cnt > (uintptr_t)1U)
		prof_malloc_record_object(ptr, usize, cnt);
	else
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
}
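
#if 0
/*
 * Illustrative sketch only (never compiled): the deallocation-side
 * counterpart to the allocation example above.  A hypothetical free path
 * calls prof_free() before releasing the object; prof_free() itself skips
 * unsampled allocations (ctx of (prof_ctx_t *)1U).  example_free() and
 * example_dalloc() are hypothetical names, not jemalloc functions.
 */
static void
example_free(void *p)
{

	if (config_prof && opt_prof)
		prof_free(p, isalloc(p, true));
	example_dalloc(p);	/* Underlying deallocation (hypothetical). */
}
#endif
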
JEMALLOC_INLINE void
prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
    size_t old_usize, prof_ctx_t *old_ctx)
{
	prof_thr_cnt_t *told_cnt;

	cassert(config_prof);
	assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);

	if (ptr != NULL) {
		assert(usize == isalloc(ptr, true));
		if (prof_sample_accum_update(usize, true, NULL)) {
			/*
			 * Don't sample.  The usize passed to PROF_ALLOC_PREP()
			 * was larger than what actually got allocated, so a
			 * backtrace was captured for this allocation, even
			 * though its actual usize was insufficient to cross
			 * the sample threshold.
			 */
			cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	}

	if ((uintptr_t)old_ctx > (uintptr_t)1U) {
		told_cnt = prof_lookup(old_ctx->bt);
		if (told_cnt == NULL) {
			/*
			 * It's too late to propagate OOM for this realloc(),
			 * so operate directly on old_ctx->cnt_merged.
			 */
			malloc_mutex_lock(old_ctx->lock);
			old_ctx->cnt_merged.curobjs--;
			old_ctx->cnt_merged.curbytes -= old_usize;
			malloc_mutex_unlock(old_ctx->lock);
			told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
		}
	} else
		told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;

	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		prof_ctx_set(ptr, cnt->ctx);
		cnt->epoch++;
	} else if (ptr != NULL)
		prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U) {
		told_cnt->cnts.curobjs--;
		told_cnt->cnts.curbytes -= old_usize;
	}
	if ((uintptr_t)cnt > (uintptr_t)1U) {
		cnt->cnts.curobjs++;
		cnt->cnts.curbytes += usize;
		if (opt_prof_accum) {
			cnt->cnts.accumobjs++;
			cnt->cnts.accumbytes += usize;
		}
	}
	/*********/
	mb_write();
	/*********/
	if ((uintptr_t)told_cnt > (uintptr_t)1U)
		told_cnt->epoch++;
	if ((uintptr_t)cnt > (uintptr_t)1U)
		cnt->epoch++;
	/*********/
	mb_write(); /* Not strictly necessary. */
}

JEMALLOC_INLINE void
prof_free(const void *ptr, size_t size)
{
	prof_ctx_t *ctx = prof_ctx_get(ptr);

	cassert(config_prof);

	if ((uintptr_t)ctx > (uintptr_t)1) {
		prof_thr_cnt_t *tcnt;
		assert(size == isalloc(ptr, true));
		tcnt = prof_lookup(ctx->bt);

		if (tcnt != NULL) {
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
			tcnt->cnts.curobjs--;
			tcnt->cnts.curbytes -= size;
			/*********/
			mb_write();
			/*********/
			tcnt->epoch++;
			/*********/
			mb_write();
			/*********/
		} else {
			/*
			 * OOM during free() cannot be propagated, so operate
			 * directly on ctx->cnt_merged.
			 */
			malloc_mutex_lock(ctx->lock);
			ctx->cnt_merged.curobjs--;
			ctx->cnt_merged.curbytes -= size;
			malloc_mutex_unlock(ctx->lock);
		}
	}
}
#endif

#endif /* JEMALLOC_H_INLINES */
/******************************************************************************/