/*
 * Copyright 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <limits.h>
#include <assert.h>
#include <linux/futex.h>
#include <linux/memfd.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#include "anv_private.h"

#ifdef HAVE_VALGRIND
#define VG_NOACCESS_READ(__ptr) ({                        \
   VALGRIND_MAKE_MEM_DEFINED((__ptr), sizeof(*(__ptr)));  \
   __typeof(*(__ptr)) __val = *(__ptr);                   \
   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr))); \
   __val;                                                 \
})
#define VG_NOACCESS_WRITE(__ptr, __val) ({                 \
   VALGRIND_MAKE_MEM_UNDEFINED((__ptr), sizeof(*(__ptr))); \
   *(__ptr) = (__val);                                     \
   VALGRIND_MAKE_MEM_NOACCESS((__ptr), sizeof(*(__ptr)));  \
})
#else
#define VG_NOACCESS_READ(__ptr) (*(__ptr))
#define VG_NOACCESS_WRITE(__ptr, __val) (*(__ptr) = (__val))
#endif

/* Design goals:
 *
 *  - Lock free (except when resizing underlying bos)
 *
 *  - Constant time allocation with typically only one atomic
 *
 *  - Multiple allocation sizes without fragmentation
 *
 *  - Can grow while keeping addresses and offset of contents stable
 *
 *  - All allocations within one bo so we can point one of the
 *    STATE_BASE_ADDRESS pointers at it.
 *
 * The overall design is a two-level allocator: top level is a fixed size, big
 * block (8k) allocator, which operates out of a bo. Allocation is done by
 * either pulling a block from the free list or growing the used range of the
 * bo. Growing the range may run out of space in the bo, in which case we need
 * to grow the bo itself. Growing the bo is tricky in a multi-threaded,
 * lockless environment: we need to keep all pointers and contents in the old
 * map valid. GEM bos in general can't grow, but we use a trick: we create a
 * memfd and use ftruncate to grow it as necessary. We mmap the new size and
 * then create a gem bo for it using the new gem userptr ioctl. Without
 * heavy-handed locking around our allocation fast-path, there isn't really a
 * way to munmap the old mmap, so we just keep it around until garbage
 * collection time. While the block allocator is lockless for normal
 * operations, we block other threads trying to allocate while we're growing
 * the map. It shouldn't happen often, and growing is fast anyway.
 *
 * At the next level we can use various sub-allocators. The state pool is a
 * pool of smaller, fixed size objects, which operates much like the block
 * pool. It uses a free list for freeing objects, but when it runs out of
 * space it just allocates a new block from the block pool. This allocator is
 * intended for longer lived state objects such as SURFACE_STATE and most
 * other persistent state objects in the API. We may need to track more info
 * with these objects and a pointer back to the CPU object (e.g. VkImage). In
 * those cases we just allocate a slightly bigger object and put the extra
 * state after the GPU state object.
 *
 * The state stream allocator works similarly to how the i965 DRI driver
 * streams all its state. Even with Vulkan, we need to emit transient state
 * (whether surface state base or dynamic state base), and for that we can
 * just get a block and fill it up. These cases are local to a command buffer
 * and the sub-allocator need not be thread safe. The streaming allocator gets
 * a new block when it runs out of space and chains them together so they can
 * be easily freed.
 */
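/* Illustrative sketch (not part of the driver) of the "one atomic"
 * allocation fast path described above. It assumes, as the atomic add in
 * anv_block_pool_alloc_new() below relies on, that struct anv_block_state
 * packs the `next` offset in the low bits and the `end` offset in the high
 * bits of its 64-bit `u64` view, so a single atomic add both claims a block
 * and snapshots the current limit:
 *
 *    struct anv_block_state state;
 *    state.u64 = __sync_fetch_and_add(&pool_state->u64, block_size);
 *    if (state.next < state.end)
 *       return state.next;   // common case: one atomic, no locks
 *    // otherwise the pool has to be grown (slow path, under a mutex)
 */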

/* Allocations are always at least 64 byte aligned, so 1 is an invalid value.
 * We use it to indicate the free list is empty. */
#define EMPTY 1

struct anv_mmap_cleanup {
   void *map;
   size_t size;
   uint32_t gem_handle;
};

#define ANV_MMAP_CLEANUP_INIT ((struct anv_mmap_cleanup){0})

static inline long
sys_futex(void *addr1, int op, int val1,
          struct timespec *timeout, void *addr2, int val3)
{
   return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
}

static inline int
futex_wake(uint32_t *addr, int count)
{
   return sys_futex(addr, FUTEX_WAKE, count, NULL, NULL, 0);
}

static inline int
futex_wait(uint32_t *addr, int32_t value)
{
   return sys_futex(addr, FUTEX_WAIT, value, NULL, NULL, 0);
}

static inline int
memfd_create(const char *name, unsigned int flags)
{
   return syscall(SYS_memfd_create, name, flags);
}

static inline uint32_t
ilog2_round_up(uint32_t value)
{
   assert(value != 0);
   return 32 - __builtin_clz(value - 1);
}

static inline uint32_t
round_to_power_of_two(uint32_t value)
{
   return 1 << ilog2_round_up(value);
}

static bool
anv_free_list_pop(union anv_free_list *list, void **map, int32_t *offset)
{
   union anv_free_list current, new, old;

   current.u64 = list->u64;
   while (current.offset != EMPTY) {
      /* We have to add a memory barrier here so that the list head (and
       * offset) gets read before we read the map pointer. This way we
       * know that the map pointer is valid for the given offset at the
       * point where we read it.
       */
      __sync_synchronize();

      int32_t *next_ptr = *map + current.offset;
      new.offset = VG_NOACCESS_READ(next_ptr);
      new.count = current.count + 1;
      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
      if (old.u64 == current.u64) {
         *offset = current.offset;
         return true;
      }
      current = old;
   }

   return false;
}

static void
anv_free_list_push(union anv_free_list *list, void *map, int32_t offset)
{
   union anv_free_list current, old, new;
   int32_t *next_ptr = map + offset;

   old = *list;
   do {
      current = old;
      VG_NOACCESS_WRITE(next_ptr, current.offset);
      new.offset = offset;
      new.count = current.count + 1;
      old.u64 = __sync_val_compare_and_swap(&list->u64, current.u64, new.u64);
   } while (old.u64 != current.u64);
}

/* All pointers in the ptr_free_list are assumed to be page-aligned. This
 * means that the bottom 12 bits should all be zero.
 */
#define PFL_COUNT(x) ((uintptr_t)(x) & 0xfff)
#define PFL_PTR(x) ((void *)((uintptr_t)(x) & ~(uintptr_t)0xfff))
#define PFL_PACK(ptr, count) ({                                          \
   (void *)(((uintptr_t)(ptr) & ~(uintptr_t)0xfff) | ((count) & 0xfff)); \
})

static bool
anv_ptr_free_list_pop(void **list, void **elem)
{
   void *current = *list;
   while (PFL_PTR(current) != NULL) {
      void **next_ptr = PFL_PTR(current);
      void *new_ptr = VG_NOACCESS_READ(next_ptr);
      unsigned new_count = PFL_COUNT(current) + 1;
      void *new = PFL_PACK(new_ptr, new_count);
      void *old = __sync_val_compare_and_swap(list, current, new);
      if (old == current) {
         *elem = PFL_PTR(current);
         return true;
      }
      current = old;
   }

   return false;
}

static void
anv_ptr_free_list_push(void **list, void *elem)
{
   void *old, *current;
   void **next_ptr = elem;

   /* The pointer-based free list requires that the pointer be
    * page-aligned. This is because we use the bottom 12 bits of the
    * pointer to store a counter to solve the ABA concurrency problem.
    */
   assert(((uintptr_t)elem & 0xfff) == 0);

   old = *list;
   do {
      current = old;
      VG_NOACCESS_WRITE(next_ptr, PFL_PTR(current));
      unsigned new_count = PFL_COUNT(current) + 1;
      void *new = PFL_PACK(elem, new_count);
      old = __sync_val_compare_and_swap(list, current, new);
   } while (old != current);
}
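
/* Note on the `count` fields above (illustrative, not driver code): both
 * free lists pair the list head with a counter that is bumped on every push
 * and pop. Without it, a compare-and-swap based pop is vulnerable to the
 * classic ABA problem:
 *
 *    thread A reads head = X and next(X) = Y, then stalls;
 *    thread B pops X, pops Y, then pushes X back;
 *    thread A's CAS still sees X at the head and succeeds,
 *       installing the stale Y as the new head.
 *
 * Because the counter changes on every operation, thread A's CAS compares
 * the full {head, count} word and fails in that scenario. For the pointer
 * based list the counter is squeezed into the low 12 bits of the
 * page-aligned pointer, e.g. PFL_PACK((void *)0x7f0000123000, 5) yields
 * 0x7f0000123005, from which PFL_PTR() and PFL_COUNT() recover the parts.
 */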

static uint32_t
anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state);

VkResult
anv_block_pool_init(struct anv_block_pool *pool,
                    struct anv_device *device, uint32_t block_size)
{
   VkResult result;

   assert(util_is_power_of_two(block_size));

   pool->device = device;
   anv_bo_init(&pool->bo, 0, 0);
   pool->block_size = block_size;
   pool->free_list = ANV_FREE_LIST_EMPTY;
   pool->back_free_list = ANV_FREE_LIST_EMPTY;

   pool->fd = memfd_create("block pool", MFD_CLOEXEC);
   if (pool->fd == -1)
      return vk_error(VK_ERROR_INITIALIZATION_FAILED);

   /* Just make it 2GB up-front. The Linux kernel won't actually back it
    * with pages until we either map and fault on one of them or we use
    * userptr and send a chunk of it off to the GPU.
    */
   if (ftruncate(pool->fd, BLOCK_POOL_MEMFD_SIZE) == -1) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
      goto fail_fd;
   }

   if (!u_vector_init(&pool->mmap_cleanups,
                      round_to_power_of_two(sizeof(struct anv_mmap_cleanup)),
                      128)) {
      result = vk_error(VK_ERROR_INITIALIZATION_FAILED);
      goto fail_fd;
   }

   pool->state.next = 0;
   pool->state.end = 0;
   pool->back_state.next = 0;
   pool->back_state.end = 0;

   /* Immediately grow the pool so we'll have a backing bo. */
   pool->state.end = anv_block_pool_grow(pool, &pool->state);

   return VK_SUCCESS;

fail_fd:
   close(pool->fd);

   return result;
}

void
anv_block_pool_finish(struct anv_block_pool *pool)
{
   struct anv_mmap_cleanup *cleanup;

   u_vector_foreach(cleanup, &pool->mmap_cleanups) {
      if (cleanup->map)
         munmap(cleanup->map, cleanup->size);
      if (cleanup->gem_handle)
         anv_gem_close(pool->device, cleanup->gem_handle);
   }

   u_vector_finish(&pool->mmap_cleanups);

   close(pool->fd);
}

#define PAGE_SIZE 4096

/** Grows and re-centers the block pool.
 *
 * We grow the block pool in one or both directions in such a way that the
 * following conditions are met:
 *
 *  1) The size of the entire pool is always a power of two.
 *
 *  2) The pool only grows on both ends. Neither end can get
 *     shortened.
 *
 *  3) At the end of the allocation, we have about twice as much space
 *     allocated for each end as we have used. This way the pool doesn't
 *     grow too far in one direction or the other.
 *
 *  4) If the _alloc_back() has never been called, then the back portion of
 *     the pool retains a size of zero. (This makes it easier for users of
 *     the block pool that only want a one-sided pool.)
 *
 *  5) We have enough space allocated for at least one more block in
 *     whichever side `state` points to.
 *
 *  6) The center of the pool is always aligned to both the block_size of
 *     the pool and a 4K CPU page.
 */
static uint32_t
anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state)
{
   size_t size;
   void *map;
   uint32_t gem_handle;
   struct anv_mmap_cleanup *cleanup;

   pthread_mutex_lock(&pool->device->mutex);

   assert(state == &pool->state || state == &pool->back_state);

   /* Gather a little usage information on the pool. Since we may have
    * threads waiting in queue to get some storage while we resize, it's
    * actually possible that total_used will be larger than old_size. In
    * particular, block_pool_alloc() increments state->next prior to
    * calling block_pool_grow, so this ensures that we get enough space for
    * whichever side tries to grow the pool.
    *
    * We align to a page size because it makes it easier to do our
    * calculations later in such a way that we stay page-aligned.
    */
   uint32_t back_used = align_u32(pool->back_state.next, PAGE_SIZE);
   uint32_t front_used = align_u32(pool->state.next, PAGE_SIZE);
   uint32_t total_used = front_used + back_used;

   assert(state == &pool->state || back_used > 0);

   size_t old_size = pool->bo.size;

   if (old_size != 0 &&
       back_used * 2 <= pool->center_bo_offset &&
       front_used * 2 <= (old_size - pool->center_bo_offset)) {
      /* If we're in this case then this isn't the first allocation and we
       * already have enough space on both sides to hold double what we
       * have allocated. There's nothing for us to do.
       */
      goto done;
   }

   if (old_size == 0) {
      /* This is the first allocation */
      size = MAX2(32 * pool->block_size, PAGE_SIZE);
   } else {
      size = old_size * 2;
   }

   /* We can't have a block pool bigger than 1GB because we use signed
    * 32-bit offsets in the free list and we don't want overflow. We
    * should never need a block pool bigger than 1GB anyway.
    */
   assert(size <= (1u << 31));

   /* We compute a new center_bo_offset such that, when we double the size
    * of the pool, we maintain the ratio of how much is used by each side.
    * This way things should remain more-or-less balanced.
    */
   uint32_t center_bo_offset;
   if (back_used == 0) {
      /* If we're in this case then we have never called alloc_back(). In
       * this case, we want to keep the offset at 0 to make things as simple
       * as possible for users that don't care about back allocations.
       */
      center_bo_offset = 0;
   } else {
      /* Try to "center" the allocation based on how much is currently in
       * use on each side of the center line.
       */
      center_bo_offset = ((uint64_t)size * back_used) / total_used;

      /* Align down to a multiple of both the block size and page size */
      uint32_t granularity = MAX2(pool->block_size, PAGE_SIZE);
      assert(util_is_power_of_two(granularity));
      center_bo_offset &= ~(granularity - 1);

      assert(center_bo_offset >= back_used);

      /* Make sure we don't shrink the back end of the pool */
      if (center_bo_offset < pool->back_state.end)
         center_bo_offset = pool->back_state.end;

      /* Make sure that we don't shrink the front end of the pool */
      if (size - center_bo_offset < pool->state.end)
         center_bo_offset = size - pool->state.end;
   }

   assert(center_bo_offset % pool->block_size == 0);
   assert(center_bo_offset % PAGE_SIZE == 0);

   /* Assert that we only ever grow the pool */
   assert(center_bo_offset >= pool->back_state.end);
   assert(size - center_bo_offset >= pool->state.end);

   cleanup = u_vector_add(&pool->mmap_cleanups);
   if (!cleanup)
      goto fail;
   *cleanup = ANV_MMAP_CLEANUP_INIT;

   /* Just leak the old map until we destroy the pool. We can't munmap it
    * without races or imposing locking on the block allocate fast path. On
    * the whole the leaked maps add up to less than the size of the
    * current map. MAP_POPULATE seems like the right thing to do, but we
    * should try to get some numbers.
    */
   map = mmap(NULL, size, PROT_READ | PROT_WRITE,
              MAP_SHARED | MAP_POPULATE, pool->fd,
              BLOCK_POOL_MEMFD_CENTER - center_bo_offset);
   if (map == MAP_FAILED)
      goto fail;

   cleanup->map = map;
   cleanup->size = size;

   gem_handle = anv_gem_userptr(pool->device, map, size);
   if (gem_handle == 0)
      goto fail;
   cleanup->gem_handle = gem_handle;

#if 0
   /* Regular objects are created I915_CACHING_CACHED on LLC platforms and
    * I915_CACHING_NONE on non-LLC platforms. However, userptr objects are
    * always created as I915_CACHING_CACHED, which on non-LLC means
    * snooped. That can be useful but comes with a bit of overhead. Since
    * we're explicitly clflushing and don't want the overhead we need to
    * turn it off.
    */
   if (!pool->device->info.has_llc) {
      anv_gem_set_caching(pool->device, gem_handle, I915_CACHING_NONE);
      anv_gem_set_domain(pool->device, gem_handle,
                         I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
   }
#endif

   /* Now that we successfully allocated everything, we can write the new
    * values back into pool. */
   pool->map = map + center_bo_offset;
   pool->center_bo_offset = center_bo_offset;
   anv_bo_init(&pool->bo, gem_handle, size);
   pool->bo.map = map;

done:
   pthread_mutex_unlock(&pool->device->mutex);

   /* Return the appropriate new size. This function never actually
    * updates state->next. Instead, we let the caller do that because it
    * needs to do so in order to maintain its concurrency model.
    */
   if (state == &pool->state) {
      return pool->bo.size - pool->center_bo_offset;
   } else {
      assert(pool->center_bo_offset > 0);
      return pool->center_bo_offset;
   }

fail:
   pthread_mutex_unlock(&pool->device->mutex);

   return 0;
}
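
/* Worked example of the re-centering above (illustrative numbers only):
 * with block_size = 8192 the alignment granularity is 8192. Suppose
 * old_size is 65536 with center_bo_offset = 8192, back_used = 8192 and
 * front_used = 49152. The front end has less than twice its usage
 * available (57344 < 98304), so we double to size = 131072 and compute
 *
 *    center_bo_offset = 131072 * 8192 / 57344 = 18724,
 *
 * which aligns down to 16384. The back end keeps roughly twice its usage
 * and the front end gets 114688 bytes, so the two sides stay balanced.
 */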

static uint32_t
anv_block_pool_alloc_new(struct anv_block_pool *pool,
                         struct anv_block_state *pool_state)
{
   struct anv_block_state state, old, new;

   while (1) {
      state.u64 = __sync_fetch_and_add(&pool_state->u64, pool->block_size);
      if (state.next < state.end) {
         assert(pool->map);
         return state.next;
      } else if (state.next == state.end) {
         /* We allocated the first block outside the pool, so we have to
          * grow it. pool_state->next acts as a mutex: threads that try to
          * allocate now will get block indexes above the current limit and
          * hit futex_wait below.
          */
         new.next = state.next + pool->block_size;
         new.end = anv_block_pool_grow(pool, pool_state);
         assert(new.end >= new.next && new.end % pool->block_size == 0);
         old.u64 = __sync_lock_test_and_set(&pool_state->u64, new.u64);
         if (old.next != state.next)
            futex_wake(&pool_state->end, INT_MAX);
         return state.next;
      } else {
         futex_wait(&pool_state->end, state.end);
         continue;
      }
   }
}

int32_t
anv_block_pool_alloc(struct anv_block_pool *pool)
{
   int32_t offset;

   /* Try free list first. */
   if (anv_free_list_pop(&pool->free_list, &pool->map, &offset)) {
      assert(offset >= 0);
      assert(pool->map);
      return offset;
   }

   return anv_block_pool_alloc_new(pool, &pool->state);
}

/* Allocates a block out of the back of the block pool.
 *
 * This will allocate a block earlier than the "start" of the block pool.
 * The offsets returned from this function will be negative but will still
 * be correct relative to the block pool's map pointer.
 *
 * If you ever use anv_block_pool_alloc_back, then you will have to do
 * gymnastics with the block pool's BO when doing relocations.
 */
int32_t
anv_block_pool_alloc_back(struct anv_block_pool *pool)
{
   int32_t offset;

   /* Try free list first. */
   if (anv_free_list_pop(&pool->back_free_list, &pool->map, &offset)) {
      assert(offset < 0);
      assert(pool->map);
      return offset;
   }

   offset = anv_block_pool_alloc_new(pool, &pool->back_state);

   /* The offset we get out of anv_block_pool_alloc_new() is actually the
    * number of bytes downwards from the middle to the end of the block.
    * We need to turn it into a (negative) offset from the middle to the
    * start of the block.
    */
   assert(offset >= 0);
   return -(offset + pool->block_size);
}

void
anv_block_pool_free(struct anv_block_pool *pool, int32_t offset)
{
   if (offset < 0) {
      anv_free_list_push(&pool->back_free_list, pool->map, offset);
   } else {
      anv_free_list_push(&pool->free_list, pool->map, offset);
   }
}
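
/* Illustrative use of the block pool (a hypothetical caller, not driver
 * code; `device` stands in for some struct anv_device *): offsets from the
 * front are non-negative, offsets from the back are negative, and both stay
 * valid relative to pool.map across growth.
 *
 *    struct anv_block_pool pool;
 *    anv_block_pool_init(&pool, device, 8192);
 *
 *    int32_t front = anv_block_pool_alloc(&pool);      // e.g. 0, 8192, ...
 *    int32_t back = anv_block_pool_alloc_back(&pool);  // e.g. -8192, ...
 *    void *cpu_ptr = pool.map + front;                 // CPU address
 *
 *    anv_block_pool_free(&pool, front);
 *    anv_block_pool_free(&pool, back);
 *    anv_block_pool_finish(&pool);
 */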

static void
anv_fixed_size_state_pool_init(struct anv_fixed_size_state_pool *pool,
                               size_t state_size)
{
   /* At least a cache line and must divide the block size. */
   assert(state_size >= 64 && util_is_power_of_two(state_size));

   pool->state_size = state_size;
   pool->free_list = ANV_FREE_LIST_EMPTY;
   pool->block.next = 0;
   pool->block.end = 0;
}

static uint32_t
anv_fixed_size_state_pool_alloc(struct anv_fixed_size_state_pool *pool,
                                struct anv_block_pool *block_pool)
{
   int32_t offset;
   struct anv_block_state block, old, new;

   /* Try free list first. */
   if (anv_free_list_pop(&pool->free_list, &block_pool->map, &offset)) {
      assert(offset >= 0);
      return offset;
   }

   /* If the free list was empty (or somebody raced us and took the items)
    * we allocate a new item from the end of the block */
restart:
   block.u64 = __sync_fetch_and_add(&pool->block.u64, pool->state_size);

   if (block.next < block.end) {
      return block.next;
   } else if (block.next == block.end) {
      offset = anv_block_pool_alloc(block_pool);
      new.next = offset + pool->state_size;
      new.end = offset + block_pool->block_size;
      old.u64 = __sync_lock_test_and_set(&pool->block.u64, new.u64);
      if (old.next != block.next)
         futex_wake(&pool->block.end, INT_MAX);
      return offset;
   } else {
      futex_wait(&pool->block.end, block.end);
      goto restart;
   }
}

static void
anv_fixed_size_state_pool_free(struct anv_fixed_size_state_pool *pool,
                               struct anv_block_pool *block_pool,
                               uint32_t offset)
{
   anv_free_list_push(&pool->free_list, block_pool->map, offset);
}

void
anv_state_pool_init(struct anv_state_pool *pool,
                    struct anv_block_pool *block_pool)
{
   pool->block_pool = block_pool;
   for (unsigned i = 0; i < ANV_STATE_BUCKETS; i++) {
      size_t size = 1 << (ANV_MIN_STATE_SIZE_LOG2 + i);
      anv_fixed_size_state_pool_init(&pool->buckets[i], size);
   }
   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
}

void
anv_state_pool_finish(struct anv_state_pool *pool)
{
   VG(VALGRIND_DESTROY_MEMPOOL(pool));
}

struct anv_state
anv_state_pool_alloc(struct anv_state_pool *pool, size_t size, size_t align)
{
   unsigned size_log2 = ilog2_round_up(size < align ? align : size);
   assert(size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
   if (size_log2 < ANV_MIN_STATE_SIZE_LOG2)
      size_log2 = ANV_MIN_STATE_SIZE_LOG2;
   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;

   struct anv_state state;
   state.alloc_size = 1 << size_log2;
   state.offset = anv_fixed_size_state_pool_alloc(&pool->buckets[bucket],
                                                  pool->block_pool);
   state.map = pool->block_pool->map + state.offset;
   VG(VALGRIND_MEMPOOL_ALLOC(pool, state.map, size));
   return state;
}
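
/* Bucket selection worked example (illustrative; the actual values of
 * ANV_MIN_STATE_SIZE_LOG2 and ANV_MAX_STATE_SIZE_LOG2 come from
 * anv_private.h): a request for size = 100 with align = 64 rounds up to
 * size_log2 = ilog2_round_up(100) = 7, so the allocation comes out of the
 * 128-byte bucket at index 7 - ANV_MIN_STATE_SIZE_LOG2 and alloc_size is
 * reported as 128.
 */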

void
anv_state_pool_free(struct anv_state_pool *pool, struct anv_state state)
{
   assert(util_is_power_of_two(state.alloc_size));
   unsigned size_log2 = ilog2_round_up(state.alloc_size);
   assert(size_log2 >= ANV_MIN_STATE_SIZE_LOG2 &&
          size_log2 <= ANV_MAX_STATE_SIZE_LOG2);
   unsigned bucket = size_log2 - ANV_MIN_STATE_SIZE_LOG2;

   VG(VALGRIND_MEMPOOL_FREE(pool, state.map));
   anv_fixed_size_state_pool_free(&pool->buckets[bucket],
                                  pool->block_pool, state.offset);
}

#define NULL_BLOCK 1
struct anv_state_stream_block {
   /* The next block */
   struct anv_state_stream_block *next;

   /* The offset into the block pool at which this block starts */
   uint32_t offset;

#ifdef HAVE_VALGRIND
   /* A pointer to the first user-allocated thing in this block. This is
    * what valgrind sees as the start of the block.
    */
   void *_vg_ptr;
#endif
};

/* The state stream allocator is a one-shot, single threaded allocator for
 * variable sized blocks. We use it for allocating dynamic state.
 */
void
anv_state_stream_init(struct anv_state_stream *stream,
                      struct anv_block_pool *block_pool)
{
   stream->block_pool = block_pool;
   stream->block = NULL;

   /* Ensure that next + whatever > end. This way the first call to
    * state_stream_alloc fetches a new block.
    */
   stream->next = 1;
   stream->end = 0;

   VG(VALGRIND_CREATE_MEMPOOL(stream, 0, false));
}
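
/* Illustrative use of the state stream (a hypothetical caller, not driver
 * code; `block_pool` stands in for an already-initialized
 * struct anv_block_pool *): everything allocated from the stream lives
 * until anv_state_stream_finish() hands the underlying blocks back to the
 * block pool, which is why it suits transient, per-command-buffer state.
 *
 *    struct anv_state_stream stream;
 *    anv_state_stream_init(&stream, block_pool);
 *
 *    struct anv_state a = anv_state_stream_alloc(&stream, 64, 64);
 *    struct anv_state b = anv_state_stream_alloc(&stream, 256, 32);
 *    // fill a.map / b.map, emit a.offset / b.offset into commands ...
 *
 *    anv_state_stream_finish(&stream);
 */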

void
anv_state_stream_finish(struct anv_state_stream *stream)
{
   VG(const uint32_t block_size = stream->block_pool->block_size);

   struct anv_state_stream_block *next = stream->block;
   while (next != NULL) {
      VG(VALGRIND_MAKE_MEM_DEFINED(next, sizeof(*next)));
      struct anv_state_stream_block sb = VG_NOACCESS_READ(next);
      VG(VALGRIND_MEMPOOL_FREE(stream, sb._vg_ptr));
      VG(VALGRIND_MAKE_MEM_UNDEFINED(next, block_size));
      anv_block_pool_free(stream->block_pool, sb.offset);
      next = sb.next;
   }

   VG(VALGRIND_DESTROY_MEMPOOL(stream));
}

struct anv_state
anv_state_stream_alloc(struct anv_state_stream *stream,
                       uint32_t size, uint32_t alignment)
{
   struct anv_state_stream_block *sb = stream->block;

   struct anv_state state;

   state.offset = align_u32(stream->next, alignment);
   if (state.offset + size > stream->end) {
      uint32_t block = anv_block_pool_alloc(stream->block_pool);
      sb = stream->block_pool->map + block;

      VG(VALGRIND_MAKE_MEM_UNDEFINED(sb, sizeof(*sb)));
      sb->next = stream->block;
      sb->offset = block;
      VG(sb->_vg_ptr = NULL);
      VG(VALGRIND_MAKE_MEM_NOACCESS(sb, stream->block_pool->block_size));

      stream->block = sb;
      stream->start = block;
      stream->next = block + sizeof(*sb);
      stream->end = block + stream->block_pool->block_size;

      state.offset = align_u32(stream->next, alignment);
      assert(state.offset + size <= stream->end);
   }

   assert(state.offset > stream->start);
   state.map = (void *)sb + (state.offset - stream->start);
   state.alloc_size = size;

#ifdef HAVE_VALGRIND
   void *vg_ptr = VG_NOACCESS_READ(&sb->_vg_ptr);
   if (vg_ptr == NULL) {
      vg_ptr = state.map;
      VG_NOACCESS_WRITE(&sb->_vg_ptr, vg_ptr);
      VALGRIND_MEMPOOL_ALLOC(stream, vg_ptr, size);
   } else {
      void *state_end = state.map + state.alloc_size;
      /* This only updates the mempool. The newly allocated chunk is still
       * marked as NOACCESS. */
      VALGRIND_MEMPOOL_CHANGE(stream, vg_ptr, vg_ptr, state_end - vg_ptr);
      /* Mark the newly allocated chunk as undefined */
      VALGRIND_MAKE_MEM_UNDEFINED(state.map, state.alloc_size);
   }
#endif

   stream->next = state.offset + size;

   return state;
}

struct bo_pool_bo_link {
   struct bo_pool_bo_link *next;
   struct anv_bo bo;
};

void
anv_bo_pool_init(struct anv_bo_pool *pool, struct anv_device *device)
{
   pool->device = device;
   memset(pool->free_list, 0, sizeof(pool->free_list));

   VG(VALGRIND_CREATE_MEMPOOL(pool, 0, false));
}

void
anv_bo_pool_finish(struct anv_bo_pool *pool)
{
   for (unsigned i = 0; i < ARRAY_SIZE(pool->free_list); i++) {
      struct bo_pool_bo_link *link = PFL_PTR(pool->free_list[i]);
      while (link != NULL) {
         struct bo_pool_bo_link link_copy = VG_NOACCESS_READ(link);

         anv_gem_munmap(link_copy.bo.map, link_copy.bo.size);
         anv_gem_close(pool->device, link_copy.bo.gem_handle);
         link = link_copy.next;
      }
   }

   VG(VALGRIND_DESTROY_MEMPOOL(pool));
}

VkResult
anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo, uint32_t size)
{
   VkResult result;

   const unsigned size_log2 = size < 4096 ? 12 : ilog2_round_up(size);
   const unsigned pow2_size = 1 << size_log2;
   const unsigned bucket = size_log2 - 12;
   assert(bucket < ARRAY_SIZE(pool->free_list));

   void *next_free_void;
   if (anv_ptr_free_list_pop(&pool->free_list[bucket], &next_free_void)) {
      struct bo_pool_bo_link *next_free = next_free_void;
      *bo = VG_NOACCESS_READ(&next_free->bo);
      assert(bo->gem_handle);
      assert(bo->map == next_free);
      assert(size <= bo->size);

      VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));

      return VK_SUCCESS;
   }

   struct anv_bo new_bo;

   result = anv_bo_init_new(&new_bo, pool->device, pow2_size);
   if (result != VK_SUCCESS)
      return result;

   assert(new_bo.size == pow2_size);

   new_bo.map = anv_gem_mmap(pool->device, new_bo.gem_handle, 0, pow2_size, 0);
   if (new_bo.map == NULL) {
      anv_gem_close(pool->device, new_bo.gem_handle);
      return vk_error(VK_ERROR_MEMORY_MAP_FAILED);
   }

   *bo = new_bo;

   VG(VALGRIND_MEMPOOL_ALLOC(pool, bo->map, size));

   return VK_SUCCESS;
}
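
/* Sizing example for the pool above (illustrative numbers): a request for
 * 5000 bytes gives size_log2 = 13, so the BO is rounded up to pow2_size =
 * 8192 and lives in bucket 13 - 12 = 1. Requests of at most 4096 bytes all
 * share bucket 0, and a freed BO goes back on its bucket's free list to be
 * reused by later requests that round up to the same size.
 */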

void
anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo_in)
{
   /* Make a copy in case the anv_bo happens to be stored in the BO */
   struct anv_bo bo = *bo_in;

   VG(VALGRIND_MEMPOOL_FREE(pool, bo.map));

   struct bo_pool_bo_link *link = bo.map;
   VG_NOACCESS_WRITE(&link->bo, bo);

   assert(util_is_power_of_two(bo.size));
   const unsigned size_log2 = ilog2_round_up(bo.size);
   const unsigned bucket = size_log2 - 12;
   assert(bucket < ARRAY_SIZE(pool->free_list));

   anv_ptr_free_list_push(&pool->free_list[bucket], link);
}

// Scratch pool

void
anv_scratch_pool_init(struct anv_device *device, struct anv_scratch_pool *pool)
{
   memset(pool, 0, sizeof(*pool));
}

void
anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool)
{
   for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
      for (unsigned i = 0; i < 16; i++) {
         struct anv_scratch_bo *bo = &pool->bos[i][s];
         if (bo->exists > 0)
            anv_gem_close(device, bo->bo.gem_handle);
      }
   }
}

struct anv_bo *
anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool,
                       gl_shader_stage stage, unsigned per_thread_scratch)
{
   if (per_thread_scratch == 0)
      return NULL;

   unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048);
   assert(scratch_size_log2 < 16);

   struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage];

   /* We can use "exists" to shortcut and ignore the critical section */
   if (bo->exists)
      return &bo->bo;

   pthread_mutex_lock(&device->mutex);

   __sync_synchronize();
   if (bo->exists) {
      pthread_mutex_unlock(&device->mutex);
      return &bo->bo;
   }

   const struct anv_physical_device *physical_device =
      &device->instance->physicalDevice;
   const struct gen_device_info *devinfo = &physical_device->info;

   /* WaCSScratchSize:hsw
    *
    * Haswell's scratch space address calculation appears to be sparse
    * rather than tightly packed. The Thread ID has bits indicating which
    * subslice, EU within a subslice, and thread within an EU it is.
    * There's a maximum of two slices and two subslices, so these can be
    * stored with a single bit. Even though there are only 10 EUs per
    * subslice, this is stored in 4 bits, so there's an effective maximum
    * value of 16 EUs. Similarly, although there are only 7 threads per EU,
    * this is stored in a 3 bit number, giving an effective maximum value
    * of 8 threads per EU.
    *
    * This means that we need to use 16 * 8 instead of 10 * 7 for the
    * number of threads per subslice.
    */
   const unsigned subslices = MAX2(physical_device->subslice_total, 1);
   const unsigned scratch_ids_per_subslice =
      device->info.is_haswell ? 16 * 8 : devinfo->max_cs_threads;

   uint32_t max_threads[] = {
      [MESA_SHADER_VERTEX]    = devinfo->max_vs_threads,
      [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads,
      [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads,
      [MESA_SHADER_GEOMETRY]  = devinfo->max_gs_threads,
      [MESA_SHADER_FRAGMENT]  = devinfo->max_wm_threads,
      [MESA_SHADER_COMPUTE]   = scratch_ids_per_subslice * subslices,
   };

   uint32_t size = per_thread_scratch * max_threads[stage];

   anv_bo_init_new(&bo->bo, device, size);

   /* Set the exists last because it may be read by other threads */
   __sync_synchronize();
   bo->exists = true;

   pthread_mutex_unlock(&device->mutex);

   return &bo->bo;
}
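
/* Scratch sizing example (illustrative numbers only): per_thread_scratch =
 * 4096 gives scratch_size_log2 = ffs(4096 / 2048) = 2, so the BO is cached
 * in pool->bos[2][stage]. For a Haswell compute shader with, say, two
 * subslices the pool sizes for 16 * 8 * 2 = 256 scratch IDs rather than the
 * 10 * 7 * 2 = 140 real threads, so the BO ends up 4096 * 256 = 1 MiB
 * instead of 560 kB.
 */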